diff --git a/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40cf13994c18b8d10e7e770a2c23da820753c0f8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6dc919c12d742a9c7dd3225cb20dbeee93e3f399
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03fe413513185c3cf26ce153329419bdef47dafe
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_albert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dee1567ec94e6e22dddf5ee93cbb7d64557d87ff
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e55940927373bfda3d8a218ee8aab9deb0d7fad6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/modeling_tf_albert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2bd3bf0917a6bea3419a25aad963b382f01ccbfc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5c1ad8e8df902f4a5746bfc507f82d481f62312
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/albert/__pycache__/tokenization_albert_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6cfaf48edd01d401486f3959d71e893a71306829
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89d0ff302d25b0a3bfb08e3ff48d42d8494d46d3
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74eedaa53de5e6b31ada27ae02c73161fe95034d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e3ab745881a5e452aba8fddaa60464af0ea0953
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94e930ee11b1f28074435a52dcf9fd943b43ffa8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d220f9afbb10c265dfc0d99fa571892b68316e1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34e45d41ba6a910d2a174398de6dae44c4a9068d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..58ffe19c036de68dceb0e3807abc112bd6d8260d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c91de83d20d06097c4e9ede76faad011e7820a16
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e89df346412c3bfbd421d8b253addd63d4e2a9aa
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d72d7a790badb077fa418b6259c9d672fce48fdb
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84abc16b71855dea643f21b924ce9e97b205945c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77e46b76f44ea405597e3ebf1c1298b78cd690cc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a7e2df4d561cb1c1270a467f3d5563cdb76dfd6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27eae9cd2390677b2c99d96a5642c3b6f34e4bdd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/auto_factory.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/auto_factory.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cb71c8cbbd22afdd9c118a0dc6602ab7456e61e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/auto_factory.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/configuration_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/configuration_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..344702045596efe923bf9e4c552d63ca3030a83e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/configuration_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/feature_extraction_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/feature_extraction_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c606baf95c857d026bb1c0f56658a925e08c6a81
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/feature_extraction_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/image_processing_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/image_processing_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31c2178491cc9b1d69f49925e297782d2b3376ba
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/image_processing_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c44bf10bad8df97866e2fe0f0bd67b28323227a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_flax_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_flax_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..832adf180df89cb2a649bfbb0c53e379e2212e61
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_flax_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_tf_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_tf_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8dcf74ff4ad80f31ba04158ef16d6a23e14ac34f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/modeling_tf_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/processing_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/processing_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15544fdf3da66bc7c025c155f30f61717f3b3bb4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/processing_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/tokenization_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/tokenization_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a814b615371b2d20e8505789c8639fb3e3690ae
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/tokenization_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/video_processing_auto.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/video_processing_auto.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..95e511b273e18b0041820ff21d6a37342e79f751
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/auto/__pycache__/video_processing_auto.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efcc5357f568f214f2c8e22bbc0877a03a112b90
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/configuration_aya_vision.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/configuration_aya_vision.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d44180376546f864347c49db8d997c104427ac1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/configuration_aya_vision.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/modeling_aya_vision.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/modeling_aya_vision.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07c55717162fb8b25b8cbdfda4b22524dc41fc86
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/modeling_aya_vision.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/modular_aya_vision.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/modular_aya_vision.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82018938237686db39d0a82e2791247bd96fd23b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/modular_aya_vision.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/processing_aya_vision.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/processing_aya_vision.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7356307f28b1b0a8f0bee9be5234729aa04efde7
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/aya_vision/__pycache__/processing_aya_vision.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b375f6fb8e9c1982c0e469b55125860d4405fa08
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bertweet/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b1ab43a4564a9fc55b402518b33418666725284
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bertweet/__pycache__/tokenization_bertweet.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c553b8a0d1b5051904c1af3c2022b8f5df849359
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..405318f5e670f965934dc8b3446e9ac7499d9602
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cb5964f9cfd0c6f6d7f17b1b0e5d4f19b0bfe3a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba6a4c6849c23fe1a33d30fd283da7ef7247c2db
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93173c91e0483c416ec4ce8dc7f5b8c38189ae56
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6afb19fbaeea5d5c337b959f6a746e2bf4d33ebc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86b71e488571f95af9311c61d93c25484f0540f9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13c71e5bfab2fb833a1f490b835d2b689fa33e44
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baa1bef9059eda78bb4f399dbb12e65c53196a93
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..157683d3b5dd9df7af6d726b0329e04867147487
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e89a85e2b8e87d2a8eab9e061e079e66ceeeffa
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/configuration_bloom.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/configuration_bloom.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2850a085592d1e01b22f7162d78d5d38597b8450
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/configuration_bloom.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/modeling_bloom.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/modeling_bloom.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..10a5317ed92ad7837c30368d7de3fb78bd081ee4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/modeling_bloom.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/modeling_flax_bloom.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/modeling_flax_bloom.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ec2e93ef933f2361843a54e40d1d2cbdf5650f2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/modeling_flax_bloom.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/tokenization_bloom_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/tokenization_bloom_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72392594a7c113e305d2decebe5afb31bf130bb2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bloom/__pycache__/tokenization_bloom_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb3036dc3c28b2205b7769349a9e878b14c14b9b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4d9699f48f9422d5d4f7e7fbaf210af9ae862af
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e04d5a1983be88b3faba171c827f4a1375967bd9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22f88faac7e6ed3e9799569488f0be862a223b8d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89f9bb05789fb130d003825e326621c9cff7b0cf
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae1e5a15fc996c768163b897d5005af4009dfa14
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99269e5ee9b6326ff4026e76330c4075ffa758e8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/configuration_chameleon.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/configuration_chameleon.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19858a2ee5b476c700a358395e2ced8e4cebae16
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/configuration_chameleon.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/image_processing_chameleon.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/image_processing_chameleon.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..747258583028d3d216721d1e1e5fbf9d84095bf7
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/image_processing_chameleon.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/image_processing_chameleon_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/image_processing_chameleon_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f803da7f469cecdab22ba355f7e2421580e0523
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/image_processing_chameleon_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/modeling_chameleon.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/modeling_chameleon.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e73ba9a64f94ff6ea149540a84a755578e7c4dd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/modeling_chameleon.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/processing_chameleon.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/processing_chameleon.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ccd8b2699c8ea28695498faae853de41d8453535
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/chameleon/__pycache__/processing_chameleon.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd6b736b577eb59de4519917e8472beb3d446c63
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d8296560d711ecb72fd5fa248d170cfda1f4f46
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7588ac613fa250fe139473eb3d69088af52efff
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/code_llama/__pycache__/tokenization_code_llama_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c624b1fa1933532b2ec5b48f450aefcbb460c542
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/configuration_cpmant.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/configuration_cpmant.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2145b52c061efc8d8d30f5a74dfdca295d8c87a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/configuration_cpmant.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/modeling_cpmant.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/modeling_cpmant.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe2d869ddb06dcae42dede86e824b7b51696b0ca
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/modeling_cpmant.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/tokenization_cpmant.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/tokenization_cpmant.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8c9f4d6acd41464971dde08729daa76618847d4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/cpmant/__pycache__/tokenization_cpmant.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4239d2a7235bbae2bf056ef2b23f5137cab33a3
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c477f9d6abb27fcfe3fed26d1c24eb1342e16175
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..06245f8c1d3b89bc72b6cda020067aa331200ffd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89f169d7be3201e0beb922f1df8d559a6e0fbdf5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..314aae1846b503eb73f153b0025abd4556a85609
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3cc02a34d92dfb896b259b4f39bfea68268b6ff4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..759974419dfed0244aaedbd224f72f0ac99b99b6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/configuration_d_fine.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/configuration_d_fine.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35169cac9812e75f9cfa4aac44ad4d42fe109fdd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/configuration_d_fine.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/modular_d_fine.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/modular_d_fine.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c73967edc254bce7dfc336e681064d68052b463a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/d_fine/__pycache__/modular_d_fine.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b1e0f83047c7b1f2c087d4b3fb323ff2ecb1f984
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/configuration_deepseek_v3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/configuration_deepseek_v3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d6abf550ec24ce66147661ed021f04fd4932881
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/configuration_deepseek_v3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/modeling_deepseek_v3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/modeling_deepseek_v3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3497bb8e1fd04b12e194c2597030d790902fd41
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/modeling_deepseek_v3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/modular_deepseek_v3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/modular_deepseek_v3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34fd269f38b508f076b5a0a4fd76be2a154e01c4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_v3/__pycache__/modular_deepseek_v3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45967d3545c7e368eeef0266618302fdf411e479
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/configuration_deepseek_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/configuration_deepseek_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..580ca719af3c914f4b50cee322e313f1d2c4801e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/configuration_deepseek_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/image_processing_deepseek_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/image_processing_deepseek_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a3022220995e0060e164515c1cf5f516dae8459f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/image_processing_deepseek_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/image_processing_deepseek_vl_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/image_processing_deepseek_vl_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b184f89dfa663f81ca60f45995158d9e9832f44
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/image_processing_deepseek_vl_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/modeling_deepseek_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/modeling_deepseek_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e60e9e3a634be9620c163d9cdfcadb431787b303
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/modeling_deepseek_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/modular_deepseek_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/modular_deepseek_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a2db5c36416444e1eb7f4a708fa41f339707716
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/modular_deepseek_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/processing_deepseek_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/processing_deepseek_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5f5be69f7136acf82e2cf14379c4dd822a781e1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deepseek_vl/__pycache__/processing_deepseek_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a323652a4d47af1dabd04f4f0e000a43367ccf44
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c83fedf127ed69439508f34b67492d637e5d58b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26d668f8bfdfebee762bcf549b9df536e1330f84
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83bb9b001c6f0de2b1bb5a03b65a91477beedb98
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a8448b2d817dc9009e47fff003d5b41da2804db2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e85c98bac2d4396cb724fbfb98d988577125e0d8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e3c4c34c3a42ff1db90504338d7e4d2b4f62d1f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35059f3988a4045c912ce346578e3d54f6574827
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/bort/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/bort/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb881f60aab26bcc6b6908b6951142bfd1dde878
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/bort/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e06e186741991b2f9183f30b6591477bc60dfb5
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_deta import *
+ from .image_processing_deta import *
+ from .modeling_deta import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
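(The `__init__.py` hunk above is the standard transformers lazy-import shim: the package replaces itself in `sys.modules` with a `_LazyModule` that defers importing each submodule until one of its exported attributes is first accessed. As a rough sketch of the idea only, assuming an invented `LazyModule` class and attribute map rather than the library's actual `_LazyModule` API:

import importlib
import types


class LazyModule(types.ModuleType):
    """Defer submodule imports until an exported attribute is first touched."""

    def __init__(self, name, attr_to_module):
        super().__init__(name)
        # Maps each exported name to the module that actually defines it.
        self._attr_to_module = attr_to_module
        self.__all__ = list(attr_to_module)

    def __getattr__(self, attr):
        # Only invoked when normal lookup fails, i.e. on the first access.
        try:
            module_name = self._attr_to_module[attr]
        except KeyError:
            raise AttributeError(f"module {self.__name__!r} has no attribute {attr!r}") from None
        value = getattr(importlib.import_module(module_name), attr)
        setattr(self, attr, value)  # cache it so later lookups bypass __getattr__
        return value


# Usage sketch: "math" is imported only when lazy.ceil is first resolved.
lazy = LazyModule("demo", {"ceil": "math"})
print(lazy.ceil(2.5))  # -> 3

In the real package the mapping is built by `define_import_structure(_file)` from the submodules sitting next to `__init__.py`, which is why adding a file such as `configuration_deta.py` beside it is enough to expose `DetaConfig` lazily.)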
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f2a64baeb62262f08d9cba73130be3b66a522c6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26a7b9e1be008306326409567e498d6d24e1c9d5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/configuration_deta.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..989c1d60a7acf009259a99abb006a056d725f113
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/__pycache__/image_processing_deta.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/configuration_deta.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/configuration_deta.py
new file mode 100644
index 0000000000000000000000000000000000000000..2109902ac06546ce4e235403dd3b95bf27486272
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/configuration_deta.py
@@ -0,0 +1,278 @@
+# coding=utf-8
+# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DETA model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ...auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class DetaConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the DETA
+ [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
+ The configuration of the backbone model.
+ backbone (`str`, *optional*):
+ Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+ will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+ is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+ library.
+ backbone_kwargs (`dict`, *optional*):
+ Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+ e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+ num_queries (`int`, *optional*, defaults to 900):
+ Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can
+ detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
+ d_model (`int`, *optional*, defaults to 256):
+ Dimension of the layers.
+ encoder_layers (`int`, *optional*, defaults to 6):
+ Number of encoder layers.
+ decoder_layers (`int`, *optional*, defaults to 6):
+ Number of decoder layers.
+ encoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ decoder_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in the decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in the encoder.
+ activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ activation_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for activations inside the fully connected layer.
+ init_std (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ init_xavier_std (`float`, *optional*, defaults to 1):
+ The scaling factor used for the Xavier initialization gain in the HM Attention map module.
+ encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+ for more details.
+ auxiliary_loss (`bool`, *optional*, defaults to `False`):
+ Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+ position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+ Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
+ class_cost (`float`, *optional*, defaults to 1):
+ Relative weight of the classification error in the Hungarian matching cost.
+ bbox_cost (`float`, *optional*, defaults to 5):
+ Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
+ giou_cost (`float`, *optional*, defaults to 2):
+ Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
+ mask_loss_coefficient (`float`, *optional*, defaults to 1):
+ Relative weight of the Focal loss in the panoptic segmentation loss.
+ dice_loss_coefficient (`float`, *optional*, defaults to 1):
+ Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
+ bbox_loss_coefficient (`float`, *optional*, defaults to 5):
+ Relative weight of the L1 bounding box loss in the object detection loss.
+ giou_loss_coefficient (`float`, *optional*, defaults to 2):
+ Relative weight of the generalized IoU loss in the object detection loss.
+ eos_coefficient (`float`, *optional*, defaults to 0.1):
+ Relative classification weight of the 'no-object' class in the object detection loss.
+ num_feature_levels (`int`, *optional*, defaults to 5):
+ The number of input feature levels.
+ encoder_n_points (`int`, *optional*, defaults to 4):
+ The number of sampled keys in each feature level for each attention head in the encoder.
+ decoder_n_points (`int`, *optional*, defaults to 4):
+ The number of sampled keys in each feature level for each attention head in the decoder.
+ two_stage (`bool`, *optional*, defaults to `True`):
+            Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant
+            of DETA and are further fed into the decoder for iterative bounding box refinement.
+ two_stage_num_proposals (`int`, *optional*, defaults to 300):
+ The number of region proposals to be generated, in case `two_stage` is set to `True`.
+ with_box_refine (`bool`, *optional*, defaults to `True`):
+ Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
+ based on the predictions from the previous layer.
+ focal_alpha (`float`, *optional*, defaults to 0.25):
+ Alpha parameter in the focal loss.
+ assign_first_stage (`bool`, *optional*, defaults to `True`):
+            Whether to assign each prediction to the ground-truth object it overlaps most, provided that overlap is larger than a threshold of 0.7.
+ assign_second_stage (`bool`, *optional*, defaults to `True`):
+            Whether to apply a second assignment procedure in the second stage that closely follows the first-stage assignment procedure.
+ disable_custom_kernels (`bool`, *optional*, defaults to `True`):
+ Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
+ kernels are not supported by PyTorch ONNX export.
+
+ Examples:
+
+ ```python
+ >>> from transformers import DetaConfig, DetaModel
+
+ >>> # Initializing a DETA SenseTime/deformable-detr style configuration
+ >>> configuration = DetaConfig()
+
+ >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
+ >>> model = DetaModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "deta"
+ attribute_map = {
+ "hidden_size": "d_model",
+ "num_attention_heads": "encoder_attention_heads",
+ }
+
+ def __init__(
+ self,
+ backbone_config=None,
+ backbone=None,
+ use_pretrained_backbone=False,
+ use_timm_backbone=False,
+ backbone_kwargs=None,
+ num_queries=900,
+ max_position_embeddings=2048,
+ encoder_layers=6,
+ encoder_ffn_dim=2048,
+ encoder_attention_heads=8,
+ decoder_layers=6,
+ decoder_ffn_dim=1024,
+ decoder_attention_heads=8,
+ encoder_layerdrop=0.0,
+ is_encoder_decoder=True,
+ activation_function="relu",
+ d_model=256,
+ dropout=0.1,
+ attention_dropout=0.0,
+ activation_dropout=0.0,
+ init_std=0.02,
+ init_xavier_std=1.0,
+ return_intermediate=True,
+ auxiliary_loss=False,
+ position_embedding_type="sine",
+ num_feature_levels=5,
+ encoder_n_points=4,
+ decoder_n_points=4,
+ two_stage=True,
+ two_stage_num_proposals=300,
+ with_box_refine=True,
+ assign_first_stage=True,
+ assign_second_stage=True,
+ class_cost=1,
+ bbox_cost=5,
+ giou_cost=2,
+ mask_loss_coefficient=1,
+ dice_loss_coefficient=1,
+ bbox_loss_coefficient=5,
+ giou_loss_coefficient=2,
+ eos_coefficient=0.1,
+ focal_alpha=0.25,
+ disable_custom_kernels=True,
+ **kwargs,
+ ):
+ if use_pretrained_backbone:
+ raise ValueError("Pretrained backbones are not supported yet.")
+
+ if backbone_config is not None and backbone is not None:
+ raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+ if backbone_config is None and backbone is None:
+ logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
+ backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
+ else:
+ if isinstance(backbone_config, dict):
+ backbone_model_type = backbone_config.pop("model_type")
+ config_class = CONFIG_MAPPING[backbone_model_type]
+ backbone_config = config_class.from_dict(backbone_config)
+
+ if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
+ raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
+
+ self.backbone_config = backbone_config
+ self.backbone = backbone
+ self.use_pretrained_backbone = use_pretrained_backbone
+ self.use_timm_backbone = use_timm_backbone
+ self.backbone_kwargs = backbone_kwargs
+ self.num_queries = num_queries
+ self.max_position_embeddings = max_position_embeddings
+ self.d_model = d_model
+ self.encoder_ffn_dim = encoder_ffn_dim
+ self.encoder_layers = encoder_layers
+ self.encoder_attention_heads = encoder_attention_heads
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.decoder_layers = decoder_layers
+ self.decoder_attention_heads = decoder_attention_heads
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.activation_function = activation_function
+ self.init_std = init_std
+ self.init_xavier_std = init_xavier_std
+ self.encoder_layerdrop = encoder_layerdrop
+ self.auxiliary_loss = auxiliary_loss
+ self.position_embedding_type = position_embedding_type
+ # deformable attributes
+ self.num_feature_levels = num_feature_levels
+ self.encoder_n_points = encoder_n_points
+ self.decoder_n_points = decoder_n_points
+ self.two_stage = two_stage
+ self.two_stage_num_proposals = two_stage_num_proposals
+ self.with_box_refine = with_box_refine
+ self.assign_first_stage = assign_first_stage
+ self.assign_second_stage = assign_second_stage
+ if two_stage is True and with_box_refine is False:
+ raise ValueError("If two_stage is True, with_box_refine must be True.")
+ # Hungarian matcher
+ self.class_cost = class_cost
+ self.bbox_cost = bbox_cost
+ self.giou_cost = giou_cost
+ # Loss coefficients
+ self.mask_loss_coefficient = mask_loss_coefficient
+ self.dice_loss_coefficient = dice_loss_coefficient
+ self.bbox_loss_coefficient = bbox_loss_coefficient
+ self.giou_loss_coefficient = giou_loss_coefficient
+ self.eos_coefficient = eos_coefficient
+ self.focal_alpha = focal_alpha
+ self.disable_custom_kernels = disable_custom_kernels
+ super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+ @property
+ def num_attention_heads(self) -> int:
+ return self.encoder_attention_heads
+
+ @property
+ def hidden_size(self) -> int:
+ return self.d_model
+
+ @property
+ def sub_configs(self):
+ return (
+ {"backbone_config": type(self.backbone_config)}
+ if getattr(self, "backbone_config", None) is not None
+ else {}
+ )
+
+
+__all__ = ["DetaConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/image_processing_deta.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/image_processing_deta.py
new file mode 100644
index 0000000000000000000000000000000000000000..434d25a1ab5152b0de5e74463c3fe15eb16f901b
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/image_processing_deta.py
@@ -0,0 +1,1228 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Deformable DETR."""
+
+import pathlib
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union
+
+import numpy as np
+
+from ....feature_extraction_utils import BatchFeature
+from ....image_processing_utils import BaseImageProcessor, get_size_dict
+from ....image_transforms import (
+ PaddingMode,
+ center_to_corners_format,
+ corners_to_center_format,
+ pad,
+ rescale,
+ resize,
+ rgb_to_id,
+ to_channel_dimension_format,
+)
+from ....image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ AnnotationFormat,
+ AnnotationType,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ get_image_size,
+ infer_channel_dimension_format,
+ is_batched,
+ is_scaled_image,
+ to_numpy_array,
+ valid_images,
+ validate_annotations,
+ validate_preprocess_arguments,
+)
+from ....utils import (
+ is_flax_available,
+ is_jax_tensor,
+ is_tf_available,
+ is_tf_tensor,
+ is_torch_available,
+ is_torch_tensor,
+ is_torchvision_available,
+ is_vision_available,
+ logging,
+)
+from ....utils.generic import TensorType
+
+
+if is_torch_available():
+ import torch
+
+
+if is_torchvision_available():
+ from torchvision.ops.boxes import batched_nms
+
+if is_vision_available():
+ import PIL
+
+
+logger = logging.get_logger(__name__) # pylint: disable=invalid-name
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
+ """
+ Computes the output image size given the input image size and the desired output size.
+
+ Args:
+ image_size (`tuple[int, int]`):
+ The input image size.
+ size (`int`):
+ The desired output size.
+ max_size (`int`, *optional*):
+ The maximum allowed output size.
+ """
+ height, width = image_size
+ raw_size = None
+ if max_size is not None:
+ min_original_size = float(min((height, width)))
+ max_original_size = float(max((height, width)))
+ if max_original_size / min_original_size * size > max_size:
+ raw_size = max_size * min_original_size / max_original_size
+ size = int(round(raw_size))
+
+ if (height <= width and height == size) or (width <= height and width == size):
+ oh, ow = height, width
+ elif width < height:
+ ow = size
+ if max_size is not None and raw_size is not None:
+ oh = int(raw_size * height / width)
+ else:
+ oh = int(size * height / width)
+ else:
+ oh = size
+ if max_size is not None and raw_size is not None:
+ ow = int(raw_size * width / height)
+ else:
+ ow = int(size * width / height)
+
+ return (oh, ow)
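+
+# Illustrative doctest-style examples (values computed from the logic above): a
+# (height, width) = (480, 640) image with size=800 and max_size=1333 keeps its
+# aspect ratio with the shorter edge matched to 800, while a very wide
+# (400, 1600) image is capped by max_size instead:
+#
+#     >>> get_size_with_aspect_ratio((480, 640), 800, max_size=1333)
+#     (800, 1066)
+#     >>> get_size_with_aspect_ratio((400, 1600), 800, max_size=1333)
+#     (333, 1333)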
+
+
+def get_resize_output_image_size(
+ input_image: np.ndarray,
+ size: Union[int, tuple[int, int], list[int]],
+ max_size: Optional[int] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+ """
+ Computes the output image size given the input image size and the desired output size. If the desired output size
+ is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
+ image size is computed by keeping the aspect ratio of the input image size.
+
+ Args:
+ input_image (`np.ndarray`):
+ The image to resize.
+ size (`int` or `tuple[int, int]` or `list[int]`):
+ The desired output size.
+ max_size (`int`, *optional*):
+ The maximum allowed output size.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+ """
+ image_size = get_image_size(input_image, input_data_format)
+ if isinstance(size, (list, tuple)):
+ return size
+
+ return get_size_with_aspect_ratio(image_size, size, max_size)
+
+
+def get_image_size_for_max_height_width(
+ input_image: np.ndarray,
+ max_height: int,
+ max_width: int,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+ """
+    Computes the output image size given the input image and the maximum allowed height and width, keeping the aspect
+    ratio. Importantly, even if image_height < max_height and image_width < max_width, the image is still resized so
+    that at least one edge is equal to max_height or max_width.
+
+ For example:
+ - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+ - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+ Args:
+ input_image (`np.ndarray`):
+ The image to resize.
+ max_height (`int`):
+ The maximum allowed height.
+ max_width (`int`):
+ The maximum allowed width.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+ """
+ image_size = get_image_size(input_image, input_data_format)
+ height, width = image_size
+ height_scale = max_height / height
+ width_scale = max_width / width
+ min_scale = min(height_scale, width_scale)
+ new_height = int(height * min_scale)
+ new_width = int(width * min_scale)
+ return new_height, new_width
+
+
+def get_numpy_to_framework_fn(arr) -> Callable:
+ """
+ Returns a function that converts a numpy array to the framework of the input array.
+
+ Args:
+ arr (`np.ndarray`): The array to convert.
+ """
+ if isinstance(arr, np.ndarray):
+ return np.array
+ if is_tf_available() and is_tf_tensor(arr):
+ import tensorflow as tf
+
+ return tf.convert_to_tensor
+ if is_torch_available() and is_torch_tensor(arr):
+ import torch
+
+ return torch.tensor
+ if is_flax_available() and is_jax_tensor(arr):
+ import jax.numpy as jnp
+
+ return jnp.array
+ raise ValueError(f"Cannot convert arrays of type {type(arr)}")
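+
+# Illustrative example: the returned callable converts numpy results back into
+# the caller's framework, e.g. (assuming torch is installed):
+#
+#     >>> to_framework = get_numpy_to_framework_fn(torch.ones(2))
+#     >>> type(to_framework(np.zeros(2)))
+#     <class 'torch.Tensor'>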
+
+
+def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
+ """
+ Squeezes an array, but only if the axis specified has dim 1.
+ """
+ if axis is None:
+ return arr.squeeze()
+
+ try:
+ return arr.squeeze(axis=axis)
+ except ValueError:
+ return arr
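+
+# Illustrative example: a unit axis is squeezed, while a non-unit axis is
+# returned unchanged instead of raising:
+#
+#     >>> safe_squeeze(np.zeros((1, 3)), axis=0).shape
+#     (3,)
+#     >>> safe_squeeze(np.zeros((2, 3)), axis=0).shape
+#     (2, 3)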
+
+
+def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict:
+ image_height, image_width = image_size
+ norm_annotation = {}
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ boxes = corners_to_center_format(boxes)
+ boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
+ norm_annotation[key] = boxes
+ else:
+ norm_annotation[key] = value
+ return norm_annotation
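+
+# Illustrative example: a corner-format box [10, 20, 50, 80] on an image of
+# (height, width) = (200, 100) becomes center format (30, 50) with size
+# (40, 60), then is divided by [width, height, width, height]:
+#
+#     >>> ann = {"boxes": np.array([[10.0, 20.0, 50.0, 80.0]])}
+#     >>> normalize_annotation(ann, image_size=(200, 100))["boxes"]  # ~[[0.3, 0.25, 0.4, 0.3]]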
+
+
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+ """
+ Return the maximum value across all indices of an iterable of values.
+ """
+ return [max(values_i) for values_i in zip(*values)]
+
+
+def get_max_height_width(
+ images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> tuple[int, int]:
+ """
+ Get the maximum height and width across all images in a batch.
+ """
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ if input_data_format == ChannelDimension.FIRST:
+ _, max_height, max_width = max_across_indices([img.shape for img in images])
+ elif input_data_format == ChannelDimension.LAST:
+ max_height, max_width, _ = max_across_indices([img.shape for img in images])
+ else:
+ raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+ return (max_height, max_width)
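+
+# Illustrative example: for a channels-first batch with shapes (3, 480, 640) and
+# (3, 600, 400), the common canvas is the per-dimension maximum:
+#
+#     >>> imgs = [np.zeros((3, 480, 640)), np.zeros((3, 600, 400))]
+#     >>> get_max_height_width(imgs, input_data_format=ChannelDimension.FIRST)
+#     (600, 640)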
+
+
+def make_pixel_mask(
+ image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+ """
+ Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+ Args:
+ image (`np.ndarray`):
+ Image to make the pixel mask for.
+ output_size (`tuple[int, int]`):
+ Output size of the mask.
+ """
+ input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+ mask = np.zeros(output_size, dtype=np.int64)
+ mask[:input_height, :input_width] = 1
+ return mask
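+
+# Illustrative example: a channels-first (3, 2, 2) image padded onto a (3, 3)
+# canvas gets a mask whose valid region is the 2x2 top-left block:
+#
+#     >>> make_pixel_mask(np.zeros((3, 2, 2)), output_size=(3, 3))
+#     array([[1, 1, 0],
+#            [1, 1, 0],
+#            [0, 0, 0]])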
+
+
+def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
+ """
+ Convert a COCO polygon annotation to a mask.
+
+ Args:
+ segmentations (`list[list[float]]`):
+ List of polygons, each polygon represented by a list of x-y coordinates.
+ height (`int`):
+ Height of the mask.
+ width (`int`):
+ Width of the mask.
+ """
+ try:
+ from pycocotools import mask as coco_mask
+ except ImportError:
+ raise ImportError("Pycocotools is not installed in your environment.")
+
+ masks = []
+ for polygons in segmentations:
+ rles = coco_mask.frPyObjects(polygons, height, width)
+ mask = coco_mask.decode(rles)
+ if len(mask.shape) < 3:
+ mask = mask[..., None]
+ mask = np.asarray(mask, dtype=np.uint8)
+ mask = np.any(mask, axis=2)
+ masks.append(mask)
+ if masks:
+ masks = np.stack(masks, axis=0)
+ else:
+ masks = np.zeros((0, height, width), dtype=np.uint8)
+
+ return masks
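+
+# Illustrative example (requires pycocotools): one object described by a single
+# triangular polygon decodes to one binary mask:
+#
+#     >>> polygons_per_object = [[[0.0, 0.0, 3.0, 0.0, 0.0, 3.0]]]
+#     >>> convert_coco_poly_to_mask(polygons_per_object, height=4, width=4).shape
+#     (1, 4, 4)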
+
+
+def prepare_coco_detection_annotation(
+ image,
+ target,
+ return_segmentation_masks: bool = False,
+ input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+ """
+ Convert the target in COCO format into the format expected by DETA.
+ """
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+
+ image_id = target["image_id"]
+ image_id = np.asarray([image_id], dtype=np.int64)
+
+ # Get all COCO annotations for the given image.
+ annotations = target["annotations"]
+ annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+ classes = [obj["category_id"] for obj in annotations]
+ classes = np.asarray(classes, dtype=np.int64)
+
+ # for conversion to coco api
+ area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+ iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64)
+
+ boxes = [obj["bbox"] for obj in annotations]
+ # guard against no boxes via resizing
+ boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+ boxes[:, 2:] += boxes[:, :2]
+ boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+ boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+ keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+ new_target = {}
+ new_target["image_id"] = image_id
+ new_target["class_labels"] = classes[keep]
+ new_target["boxes"] = boxes[keep]
+ new_target["area"] = area[keep]
+ new_target["iscrowd"] = iscrowd[keep]
+ new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+ if annotations and "keypoints" in annotations[0]:
+ keypoints = [obj["keypoints"] for obj in annotations]
+ # Converting the filtered keypoints list to a numpy array
+ keypoints = np.asarray(keypoints, dtype=np.float32)
+ # Apply the keep mask here to filter the relevant annotations
+ keypoints = keypoints[keep]
+ num_keypoints = keypoints.shape[0]
+ keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+ new_target["keypoints"] = keypoints
+
+ if return_segmentation_masks:
+ segmentation_masks = [obj["segmentation"] for obj in annotations]
+ masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
+ new_target["masks"] = masks[keep]
+
+ return new_target
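+
+# Illustrative example: COCO boxes arrive as [x, y, width, height]; the helper
+# above converts them to clipped [x0, y0, x1, y1] corners and filters out boxes
+# that collapse after clipping (the annotation dict below is a minimal assumption):
+#
+#     >>> img = np.zeros((3, 100, 100))
+#     >>> target = {"image_id": 1, "annotations": [{"category_id": 7, "area": 1200.0, "bbox": [10, 20, 30, 40]}]}
+#     >>> prepare_coco_detection_annotation(img, target)["boxes"]
+#     array([[10., 20., 40., 60.]], dtype=float32)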
+
+
+def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
+ """
+ Compute the bounding boxes around the provided panoptic segmentation masks.
+
+ Args:
+        masks: masks in format `[number_masks, height, width]`, where `number_masks` is the number of masks
+
+ Returns:
+ boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+ """
+ if masks.size == 0:
+ return np.zeros((0, 4))
+
+ h, w = masks.shape[-2:]
+ y = np.arange(0, h, dtype=np.float32)
+ x = np.arange(0, w, dtype=np.float32)
+ # see https://github.com/pytorch/pytorch/issues/50276
+ y, x = np.meshgrid(y, x, indexing="ij")
+
+ x_mask = masks * np.expand_dims(x, axis=0)
+ x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
+ x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
+ x_min = x.filled(fill_value=1e8)
+ x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
+
+ y_mask = masks * np.expand_dims(y, axis=0)
+ y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
+ y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
+ y_min = y.filled(fill_value=1e8)
+ y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
+
+ return np.stack([x_min, y_min, x_max, y_max], 1)
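+
+# Illustrative example: a mask covering rows 2..3 and columns 1..4 of a 5x6 grid
+# maps to the box [1, 2, 4, 3] (the max coordinates are inclusive pixel indices):
+#
+#     >>> m = np.zeros((1, 5, 6)); m[0, 2:4, 1:5] = 1
+#     >>> masks_to_boxes(m)
+#     array([[1., 2., 4., 3.]])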
+
+
+def prepare_coco_panoptic_annotation(
+ image: np.ndarray,
+ target: dict,
+ masks_path: Union[str, pathlib.Path],
+ return_masks: bool = True,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+) -> dict:
+ """
+ Prepare a coco panoptic annotation for DETA.
+ """
+ image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+ annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+ new_target = {}
+ new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
+ new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
+ new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
+
+ if "segments_info" in target:
+ masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
+ masks = rgb_to_id(masks)
+
+ ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
+ masks = masks == ids[:, None, None]
+ masks = masks.astype(np.uint8)
+ if return_masks:
+ new_target["masks"] = masks
+ new_target["boxes"] = masks_to_boxes(masks)
+ new_target["class_labels"] = np.array(
+ [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
+ )
+ new_target["iscrowd"] = np.asarray(
+ [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
+ )
+ new_target["area"] = np.asarray(
+ [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
+ )
+
+ return new_target
+
+
+def resize_annotation(
+ annotation: dict[str, Any],
+ orig_size: tuple[int, int],
+ target_size: tuple[int, int],
+ threshold: float = 0.5,
+ resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+ """
+ Resizes an annotation to a target size.
+
+ Args:
+ annotation (`dict[str, Any]`):
+ The annotation dictionary.
+ orig_size (`tuple[int, int]`):
+ The original size of the input image.
+ target_size (`tuple[int, int]`):
+ The target size of the image, as returned by the preprocessing `resize` step.
+ threshold (`float`, *optional*, defaults to 0.5):
+ The threshold used to binarize the segmentation masks.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.NEAREST`):
+ The resampling filter to use when resizing the masks.
+ """
+ ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
+ ratio_height, ratio_width = ratios
+
+ new_annotation = {}
+ new_annotation["size"] = target_size
+
+ for key, value in annotation.items():
+ if key == "boxes":
+ boxes = value
+ scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+ new_annotation["boxes"] = scaled_boxes
+ elif key == "area":
+ area = value
+ scaled_area = area * (ratio_width * ratio_height)
+ new_annotation["area"] = scaled_area
+ elif key == "masks":
+ masks = value[:, None]
+ masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
+ masks = masks.astype(np.float32)
+ masks = masks[:, 0] > threshold
+ new_annotation["masks"] = masks
+ elif key == "size":
+ new_annotation["size"] = target_size
+ else:
+ new_annotation[key] = value
+
+ return new_annotation
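+
+# Illustrative example: resizing a (100, 100) image to (200, 50) scales box
+# coordinates by the per-axis ratios (height x2.0, width x0.5) and areas by
+# their product (here 1.0, so the area is unchanged):
+#
+#     >>> ann = {"boxes": np.array([[10.0, 10.0, 20.0, 20.0]]), "area": np.array([100.0])}
+#     >>> out = resize_annotation(ann, orig_size=(100, 100), target_size=(200, 50))
+#     >>> out["boxes"]  # ~[[5., 20., 10., 40.]]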
+
+
+class DetaImageProcessor(BaseImageProcessor):
+ r"""
+    Constructs a DETA image processor.
+
+ Args:
+ format (`str`, *optional*, defaults to `"coco_detection"`):
+ Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
+ overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]`, *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+ Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+ in the `preprocess` method. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+ `do_rescale` parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+ `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+ Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+ `preprocess` method.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+ Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+ channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+ Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+ for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_annotations (`bool`, *optional*, defaults to `True`):
+ Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+ bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+ Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+ do_pad (`bool`, *optional*, defaults to `True`):
+ Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+ method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+ If `pad_size` is provided, the image will be padded to the specified dimensions.
+ Otherwise, the image will be padded to the maximum height and width of the batch.
+ pad_size (`dict[str, int]`, *optional*):
+            The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+
+ model_input_names = ["pixel_values", "pixel_mask"]
+
+ def __init__(
+ self,
+ format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+ do_resize: bool = True,
+ size: Optional[dict[str, int]] = None,
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ do_convert_annotations: bool = True,
+ do_pad: bool = True,
+ pad_size: Optional[dict[str, int]] = None,
+ **kwargs,
+ ) -> None:
+ if "pad_and_return_pixel_mask" in kwargs:
+ do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+ size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+ size = get_size_dict(size, default_to_square=False)
+
+ if do_convert_annotations is None:
+ do_convert_annotations = do_normalize
+
+ super().__init__(**kwargs)
+ self.format = format
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.do_convert_annotations = do_convert_annotations
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+ self.do_pad = do_pad
+ self.pad_size = pad_size
+
+ def prepare_annotation(
+ self,
+ image: np.ndarray,
+ target: dict,
+ format: Optional[AnnotationFormat] = None,
+ return_segmentation_masks: Optional[bool] = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> dict:
+ """
+ Prepare an annotation for feeding into DETA model.
+ """
+ format = format if format is not None else self.format
+
+ if format == AnnotationFormat.COCO_DETECTION:
+ return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_detection_annotation(
+ image, target, return_segmentation_masks, input_data_format=input_data_format
+ )
+ elif format == AnnotationFormat.COCO_PANOPTIC:
+ return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+ target = prepare_coco_panoptic_annotation(
+ image,
+ target,
+ masks_path=masks_path,
+ return_masks=return_segmentation_masks,
+ input_data_format=input_data_format,
+ )
+ else:
+ raise ValueError(f"Format {format} is not supported.")
+ return target
+
+ def resize(
+ self,
+ image: np.ndarray,
+ size: dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+        Resize the image to the given size. The `size` dictionary determines the target dimensions and whether the
+        aspect ratio is preserved (see the options below).
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`dict[str, int]`):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image.
+ data_format (`ChannelDimension`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred from the input
+ image.
+ """
+ size = get_size_dict(size, default_to_square=False)
+ if "shortest_edge" in size and "longest_edge" in size:
+ new_size = get_resize_output_image_size(
+ image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+ )
+ elif "height" in size and "width" in size:
+ new_size = (size["height"], size["width"])
+ elif "max_height" in size and "max_width" in size:
+ new_size = get_image_size_for_max_height_width(
+ image, size["max_height"], size["max_width"], input_data_format=input_data_format
+ )
+ else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys, 'shortest_edge' and 'longest_edge' keys, or"
+                f" 'max_height' and 'max_width' keys. Got {size.keys()}."
+            )
+ image = resize(
+ image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format
+ )
+ return image
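+
+    # Illustrative example: with {"shortest_edge": 800, "longest_edge": 1333} the
+    # aspect ratio is preserved, e.g. a (3, 480, 640) image becomes (3, 800, 1066):
+    #
+    #     >>> processor = DetaImageProcessor()
+    #     >>> processor.resize(np.zeros((3, 480, 640)), size={"shortest_edge": 800, "longest_edge": 1333}).shape
+    #     (3, 800, 1066)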
+
+ def resize_annotation(
+ self,
+ annotation,
+ orig_size,
+ size,
+ resample: PILImageResampling = PILImageResampling.NEAREST,
+ ) -> dict:
+ """
+        Resize the annotation (boxes, areas and masks) to match the resized image.
+ """
+ return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
+
+ def rescale(
+ self,
+ image: np.ndarray,
+ rescale_factor: float,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> np.ndarray:
+ """
+ Rescale the image by the given factor. image = image * rescale_factor.
+
+ Args:
+ image (`np.ndarray`):
+ Image to rescale.
+ rescale_factor (`float`):
+ The value to use for rescaling.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+ one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ """
+ return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+
+ def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+ """
+ Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
+ `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
+ """
+ return normalize_annotation(annotation, image_size=image_size)
+
+ def _update_annotation_for_padded_image(
+ self,
+ annotation: dict,
+ input_image_size: tuple[int, int],
+ output_image_size: tuple[int, int],
+ padding,
+ update_bboxes,
+ ) -> dict:
+ """
+ Update the annotation for a padded image.
+ """
+ new_annotation = {}
+ new_annotation["size"] = output_image_size
+
+ for key, value in annotation.items():
+ if key == "masks":
+ masks = value
+ masks = pad(
+ masks,
+ padding,
+ mode=PaddingMode.CONSTANT,
+ constant_values=0,
+ input_data_format=ChannelDimension.FIRST,
+ )
+ masks = safe_squeeze(masks, 1)
+ new_annotation["masks"] = masks
+ elif key == "boxes" and update_bboxes:
+ boxes = value
+ boxes *= np.asarray(
+ [
+ input_image_size[1] / output_image_size[1],
+ input_image_size[0] / output_image_size[0],
+ input_image_size[1] / output_image_size[1],
+ input_image_size[0] / output_image_size[0],
+ ]
+ )
+ new_annotation["boxes"] = boxes
+ elif key == "size":
+ new_annotation["size"] = output_image_size
+ else:
+ new_annotation[key] = value
+ return new_annotation
+
+ def _pad_image(
+ self,
+ image: np.ndarray,
+ output_size: tuple[int, int],
+ annotation: Optional[dict[str, Any]] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ update_bboxes: bool = True,
+    ) -> tuple[np.ndarray, Optional[dict[str, Any]]]:
+ """
+ Pad an image with zeros to the given size.
+ """
+ input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+ output_height, output_width = output_size
+
+ pad_bottom = output_height - input_height
+ pad_right = output_width - input_width
+ padding = ((0, pad_bottom), (0, pad_right))
+ padded_image = pad(
+ image,
+ padding,
+ mode=PaddingMode.CONSTANT,
+ constant_values=constant_values,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ )
+ if annotation is not None:
+ annotation = self._update_annotation_for_padded_image(
+ annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+ )
+ return padded_image, annotation
+
+ def pad(
+ self,
+ images: list[np.ndarray],
+ annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+ constant_values: Union[float, Iterable[float]] = 0,
+ return_pixel_mask: bool = True,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: Optional[ChannelDimension] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ update_bboxes: bool = True,
+ pad_size: Optional[dict[str, int]] = None,
+ ) -> BatchFeature:
+ """
+ Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
+ in the batch and optionally returns their corresponding pixel mask.
+
+ Args:
+ images (list[`np.ndarray`]):
+ Images to pad.
+ annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+ Annotations to transform according to the padding that is applied to the images.
+ constant_values (`float` or `Iterable[float]`, *optional*):
+ The value to use for the padding if `mode` is `"constant"`.
+ return_pixel_mask (`bool`, *optional*, defaults to `True`):
+ Whether to return a pixel mask.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ update_bboxes (`bool`, *optional*, defaults to `True`):
+ Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(center_x, center_y, width, height)`
+ format, the bounding boxes will not be updated.
+ pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ pad_size = pad_size if pad_size is not None else self.pad_size
+ if pad_size is not None:
+ padded_size = (pad_size["height"], pad_size["width"])
+ else:
+ padded_size = get_max_height_width(images, input_data_format=input_data_format)
+
+ annotation_list = annotations if annotations is not None else [None] * len(images)
+ padded_images = []
+ padded_annotations = []
+ for image, annotation in zip(images, annotation_list):
+ padded_image, padded_annotation = self._pad_image(
+ image,
+ padded_size,
+ annotation,
+ constant_values=constant_values,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ update_bboxes=update_bboxes,
+ )
+ padded_images.append(padded_image)
+ padded_annotations.append(padded_annotation)
+
+ data = {"pixel_values": padded_images}
+
+ if return_pixel_mask:
+ masks = [
+ make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
+ for image in images
+ ]
+ data["pixel_mask"] = masks
+
+ encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+ ]
+
+ return encoded_inputs
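+
+    # Illustrative example: two differently sized images are padded to a common
+    # (600, 640) canvas, with matching pixel masks:
+    #
+    #     >>> processor = DetaImageProcessor()
+    #     >>> batch = processor.pad([np.zeros((3, 480, 640)), np.zeros((3, 600, 400))], return_tensors="np")
+    #     >>> batch["pixel_values"].shape, batch["pixel_mask"].shape
+    #     ((2, 3, 600, 640), (2, 600, 640))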
+
+ def preprocess(
+ self,
+ images: ImageInput,
+ annotations: Optional[Union[list[dict], list[list[dict]]]] = None,
+ return_segmentation_masks: Optional[bool] = None,
+ masks_path: Optional[Union[str, pathlib.Path]] = None,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ resample=None, # PILImageResampling
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[Union[int, float]] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ do_convert_annotations: Optional[bool] = None,
+ do_pad: Optional[bool] = None,
+ format: Optional[Union[str, AnnotationFormat]] = None,
+ return_tensors: Optional[Union[TensorType, str]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ pad_size: Optional[dict[str, int]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or a batch of images so that it can be used by the model.
+
+ Args:
+ images (`ImageInput`):
+ Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+ from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ annotations (`list[Dict]` or `list[list[Dict]]`, *optional*):
+ List of annotations associated with the image or batch of images. If annotation is for object
+ detection, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+ dictionary. An image can have no annotations, in which case the list should be empty.
+ If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+ - "image_id" (`int`): The image id.
+ - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+ An image can have no segments, in which case the list should be empty.
+ - "file_name" (`str`): The file name of the image.
+            return_segmentation_masks (`bool`, *optional*):
+                Whether to return segmentation masks. Defaults to `False` for COCO detection annotations and `True`
+                for COCO panoptic annotations.
+ masks_path (`str` or `pathlib.Path`, *optional*):
+ Path to the directory containing the segmentation masks.
+ do_resize (`bool`, *optional*, defaults to self.do_resize):
+ Whether to resize the image.
+ size (`dict[str, int]`, *optional*, defaults to self.size):
+ Size of the image's `(height, width)` dimensions after resizing. Available options are:
+ - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+ Do NOT keep the aspect ratio.
+ - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+ the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+ less or equal to `longest_edge`.
+ - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+ aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+ `max_width`.
+ resample (`PILImageResampling`, *optional*, defaults to self.resample):
+ Resampling filter to use when resizing the image.
+ do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+ Whether to rescale the image.
+ rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+ Rescale factor to use when rescaling the image.
+ do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+ Whether to normalize the image.
+ image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean):
+ Mean to use when normalizing the image.
+ image_std (`float` or `list[float]`, *optional*, defaults to self.image_std):
+ Standard deviation to use when normalizing the image.
+ do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+ Whether to convert the annotations to the format expected by the model. Converts the bounding
+ boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+ and in relative coordinates.
+ do_pad (`bool`, *optional*, defaults to self.do_pad):
+ Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+ the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+ dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+ format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+ Format of the annotations.
+            return_tensors (`str` or `TensorType`, *optional*):
+ Type of tensors to return. If `None`, will return the list of images.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width": int}` to pad the images to. Must be larger than any image size
+ provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+ height and width in the batch.
+ """
+ if "pad_and_return_pixel_mask" in kwargs:
+ logger.warning_once(
+ "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+ "use `do_pad` instead.",
+ )
+ do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+ do_resize = self.do_resize if do_resize is None else do_resize
+ size = self.size if size is None else size
+ size = get_size_dict(size=size, default_to_square=False)
+ resample = self.resample if resample is None else resample
+ do_rescale = self.do_rescale if do_rescale is None else do_rescale
+ rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+ do_normalize = self.do_normalize if do_normalize is None else do_normalize
+ image_mean = self.image_mean if image_mean is None else image_mean
+ image_std = self.image_std if image_std is None else image_std
+ do_convert_annotations = (
+ self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+ )
+ do_pad = self.do_pad if do_pad is None else do_pad
+ pad_size = self.pad_size if pad_size is None else pad_size
+ format = self.format if format is None else format
+
+        # Here, the pad() method pads to the maximum (height, width) in the batch, so the padding arguments do not
+        # need to be validated.
+
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ if not is_batched(images):
+ images = [images]
+ annotations = [annotations] if annotations is not None else None
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+ if annotations is not None and len(images) != len(annotations):
+ raise ValueError(
+ f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+ )
+
+ format = AnnotationFormat(format)
+ if annotations is not None:
+ validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+ if (
+ masks_path is not None
+ and format == AnnotationFormat.COCO_PANOPTIC
+ and not isinstance(masks_path, (pathlib.Path, str))
+ ):
+ raise ValueError(
+ "The path to the directory containing the mask PNG files should be provided as a"
+ f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+ )
+
+ # All transformations expect numpy arrays
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+ if annotations is not None:
+ prepared_images = []
+ prepared_annotations = []
+ for image, target in zip(images, annotations):
+ target = self.prepare_annotation(
+ image,
+ target,
+ format,
+ return_segmentation_masks=return_segmentation_masks,
+ masks_path=masks_path,
+ input_data_format=input_data_format,
+ )
+ prepared_images.append(image)
+ prepared_annotations.append(target)
+ images = prepared_images
+ annotations = prepared_annotations
+ del prepared_images, prepared_annotations
+
+ # transformations
+ if do_resize:
+ if annotations is not None:
+ resized_images, resized_annotations = [], []
+ for image, target in zip(images, annotations):
+ orig_size = get_image_size(image, input_data_format)
+ resized_image = self.resize(
+ image, size=size, resample=resample, input_data_format=input_data_format
+ )
+ resized_annotation = self.resize_annotation(
+ target, orig_size, get_image_size(resized_image, input_data_format)
+ )
+ resized_images.append(resized_image)
+ resized_annotations.append(resized_annotation)
+ images = resized_images
+ annotations = resized_annotations
+ del resized_images, resized_annotations
+ else:
+ images = [
+ self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ if do_rescale:
+ images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+ if do_normalize:
+ images = [
+ self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+ ]
+
+ if do_convert_annotations and annotations is not None:
+ annotations = [
+ self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+ for annotation, image in zip(annotations, images)
+ ]
+
+ if do_pad:
+ # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+ encoded_inputs = self.pad(
+ images,
+ annotations=annotations,
+ return_pixel_mask=True,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ return_tensors=return_tensors,
+ update_bboxes=do_convert_annotations,
+ pad_size=pad_size,
+ )
+ else:
+ images = [
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ for image in images
+ ]
+ encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+ if annotations is not None:
+ encoded_inputs["labels"] = [
+ BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+ ]
+
+ return encoded_inputs
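+
+    # Illustrative example (the COCO-style target dict below is a minimal
+    # assumption): end-to-end preprocessing of one image with its detection
+    # annotations:
+    #
+    #     >>> from PIL import Image
+    #     >>> image = Image.new("RGB", (640, 480))
+    #     >>> target = {"image_id": 1, "annotations": []}
+    #     >>> enc = DetaImageProcessor().preprocess(image, annotations=target, return_tensors="pt")
+    #     >>> sorted(enc.keys())
+    #     ['labels', 'pixel_mask', 'pixel_values']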
+
+ def post_process_object_detection(
+ self,
+ outputs,
+ threshold: float = 0.5,
+        target_sizes: Optional[Union[TensorType, list[tuple]]] = None,
+ nms_threshold: float = 0.7,
+ ):
+ """
+ Converts the output of [`DetaForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+ bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+ Args:
+            outputs ([`DetaObjectDetectionOutput`]):
+ Raw outputs of the model.
+ threshold (`float`, *optional*, defaults to 0.5):
+ Score threshold to keep object detection predictions.
+ target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+ Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+ (height, width) of each image in the batch. If left to None, predictions will not be resized.
+ nms_threshold (`float`, *optional*, defaults to 0.7):
+ NMS threshold.
+
+ Returns:
+ `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+ in the batch as predicted by the model.
+ """
+ out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+ batch_size, num_queries, num_labels = out_logits.shape
+
+ if target_sizes is not None:
+ if len(out_logits) != len(target_sizes):
+ raise ValueError(
+ "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+ )
+
+ prob = out_logits.sigmoid()
+
+ all_scores = prob.view(batch_size, num_queries * num_labels).to(out_logits.device)
+ all_indexes = torch.arange(num_queries * num_labels)[None].repeat(batch_size, 1).to(out_logits.device)
+ all_boxes = torch.div(all_indexes, out_logits.shape[2], rounding_mode="floor")
+ all_labels = all_indexes % out_logits.shape[2]
+
+ boxes = center_to_corners_format(out_bbox)
+ boxes = torch.gather(boxes, 1, all_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+ # and from relative [0, 1] to absolute [0, height] coordinates
+ if target_sizes is not None:
+ if isinstance(target_sizes, list):
+ img_h = torch.Tensor([i[0] for i in target_sizes])
+ img_w = torch.Tensor([i[1] for i in target_sizes])
+ else:
+ img_h, img_w = target_sizes.unbind(1)
+
+ scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+ boxes = boxes * scale_fct[:, None, :]
+
+ results = []
+ for b in range(batch_size):
+ box = boxes[b]
+ score = all_scores[b]
+ lbls = all_labels[b]
+
+ pre_topk = score.topk(min(10000, num_queries * num_labels)).indices
+ box = box[pre_topk]
+ score = score[pre_topk]
+ lbls = lbls[pre_topk]
+
+ # apply NMS
+ keep_inds = batched_nms(box, score, lbls, nms_threshold)[:100]
+ score = score[keep_inds]
+ lbls = lbls[keep_inds]
+ box = box[keep_inds]
+
+ results.append(
+ {
+ "scores": score[score > threshold],
+ "labels": lbls[score > threshold],
+ "boxes": box[score > threshold],
+ }
+ )
+
+ return results
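+
+    # Illustrative sketch (`model`, `inputs`, `image` and `processor` are assumed
+    # to exist): typical post-processing after a forward pass, mapping boxes back
+    # to the original image size before NMS and thresholding:
+    #
+    #     >>> outputs = model(**inputs)
+    #     >>> target_sizes = torch.tensor([image.size[::-1]])  # (height, width)
+    #     >>> results = processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)
+    #     >>> results[0].keys()
+    #     dict_keys(['scores', 'labels', 'boxes'])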
+
+
+__all__ = ["DetaImageProcessor"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/modeling_deta.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/modeling_deta.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4f6f5c65dedc4e7a4d6e5151cff61226ea4a7ed
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/deta/modeling_deta.py
@@ -0,0 +1,2816 @@
+# coding=utf-8
+# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DETA model."""
+
+import copy
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ....activations import ACT2FN
+from ....file_utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_scipy_available,
+ is_torch_cuda_available,
+ is_vision_available,
+ replace_return_docstrings,
+)
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import meshgrid
+from ....utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
+from ....utils.backbone_utils import load_backbone
+from .configuration_deta import DetaConfig
+
+
+logger = logging.get_logger(__name__)
+
+MultiScaleDeformableAttention = None
+
+
+def load_cuda_kernels():
+ from torch.utils.cpp_extension import load
+
+ global MultiScaleDeformableAttention
+
+ root = Path(__file__).resolve().parent.parent.parent.parent / "kernels" / "deta"
+ src_files = [
+ root / filename
+ for filename in [
+ "vision.cpp",
+ os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
+ os.path.join("cuda", "ms_deform_attn_cuda.cu"),
+ ]
+ ]
+
+ MultiScaleDeformableAttention = load(
+ "MultiScaleDeformableAttention",
+ src_files,
+ with_cuda=True,
+ extra_include_paths=[str(root)],
+ extra_cflags=["-DWITH_CUDA=1"],
+ extra_cuda_cflags=[
+ "-DCUDA_HAS_FP16=1",
+ "-D__CUDA_NO_HALF_OPERATORS__",
+ "-D__CUDA_NO_HALF_CONVERSIONS__",
+ "-D__CUDA_NO_HALF2_OPERATORS__",
+ ],
+ )
+
+
+class MultiScaleDeformableAttentionFunction(Function):
+ @staticmethod
+ def forward(
+ context,
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ im2col_step,
+ ):
+ context.im2col_step = im2col_step
+ output = MultiScaleDeformableAttention.ms_deform_attn_forward(
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ context.im2col_step,
+ )
+ context.save_for_backward(
+ value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights
+ )
+ return output
+
+ @staticmethod
+ @once_differentiable
+ def backward(context, grad_output):
+ (
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ ) = context.saved_tensors
+ grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward(
+ value,
+ value_spatial_shapes,
+ value_level_start_index,
+ sampling_locations,
+ attention_weights,
+ grad_output,
+ context.im2col_step,
+ )
+
+ return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+if is_accelerate_available():
+ from accelerate import PartialState
+ from accelerate.utils import reduce
+
+if is_vision_available():
+ from transformers.image_transforms import center_to_corners_format
+
+if is_torchvision_available():
+ from torchvision.ops.boxes import batched_nms
+
+if is_scipy_available():
+ from scipy.optimize import linear_sum_assignment
+
+
+_CONFIG_FOR_DOC = "DetaConfig"
+_CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"
+
+
+@dataclass
+class DetaDecoderOutput(ModelOutput):
+ """
+ Base class for outputs of the DetaDecoder. This class adds two attributes to
+ BaseModelOutputWithCrossAttentions, namely:
+ - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+ - a stacked tensor of intermediate reference points.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+ used to compute the weighted average in the cross-attention heads.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ intermediate_reference_points: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class DetaModelOutput(ModelOutput):
+ """
+ Base class for outputs of the Deformable DETR encoder-decoder model.
+
+ Args:
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+ plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+ num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+ average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the first stage.
+ output_proposals (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+ Logits of proposal bounding boxes coordinates in the gen_encoder_output_proposals.
+ """
+
+ init_reference_points: Optional[torch.FloatTensor] = None
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ intermediate_reference_points: Optional[torch.FloatTensor] = None
+ decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+ enc_outputs_class: Optional[torch.FloatTensor] = None
+ enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+ output_proposals: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class DetaObjectDetectionOutput(ModelOutput):
+ """
+ Output type of [`DetaForObjectDetection`].
+
+ Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided):
+            Total loss as a linear combination of a negative log-likelihood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+ loss_dict (`Dict`, *optional*):
+ A dictionary containing the individual losses. Useful for logging.
+ logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+ Classification logits (including no-object) for all queries.
+ pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+ values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+ possible padding). You can use [`~DetaProcessor.post_process_object_detection`] to retrieve the
+ unnormalized bounding boxes.
+ auxiliary_outputs (`list[Dict]`, *optional*):
+ Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+ and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+ `pred_boxes`) for each decoder layer.
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the decoder of the model.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+ plus the initial embedding outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+ num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+ average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+ Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+ weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+ layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
+ 4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
+ in the self-attention heads.
+ intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+ Stacked intermediate hidden states (output of each layer of the decoder).
+ intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+ Stacked intermediate reference points (reference points of each layer of the decoder).
+ init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+ Initial reference points sent through the Transformer decoder.
+ enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+ picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+ foreground and background).
+ enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+ Logits of predicted bounding boxes coordinates in the first stage.
+ output_proposals (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+ Logits of proposal bounding boxes coordinates in the gen_encoder_output_proposals.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ loss_dict: Optional[dict] = None
+ logits: Optional[torch.FloatTensor] = None
+ pred_boxes: Optional[torch.FloatTensor] = None
+ auxiliary_outputs: Optional[list[dict]] = None
+ init_reference_points: Optional[torch.FloatTensor] = None
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ intermediate_hidden_states: Optional[torch.FloatTensor] = None
+ intermediate_reference_points: Optional[torch.FloatTensor] = None
+ decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+ enc_outputs_class: Optional[torch.FloatTensor] = None
+ enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+ output_proposals: Optional[torch.FloatTensor] = None
+
+
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for _ in range(N)])
+
+
+def inverse_sigmoid(x, eps=1e-5):
+ x = x.clamp(min=0, max=1)
+ x1 = x.clamp(min=eps)
+ x2 = (1 - x).clamp(min=eps)
+ return torch.log(x1 / x2)
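+
+# A minimal sanity-check sketch: `inverse_sigmoid` is a clamped logit, so it
+# round-trips with `torch.sigmoid` away from the boundaries:
+#
+#     p = torch.tensor([0.1, 0.5, 0.9])
+#     torch.testing.assert_close(inverse_sigmoid(p).sigmoid(), p)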
+
+
+class DetaFrozenBatchNorm2d(nn.Module):
+ """
+ BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rsqrt, without which models other than
+    torchvision.models.resnet[18,34,50,101] produce NaNs.
+ """
+
+ def __init__(self, n):
+ super().__init__()
+ self.register_buffer("weight", torch.ones(n))
+ self.register_buffer("bias", torch.zeros(n))
+ self.register_buffer("running_mean", torch.zeros(n))
+ self.register_buffer("running_var", torch.ones(n))
+
+ def _load_from_state_dict(
+ self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ ):
+ num_batches_tracked_key = prefix + "num_batches_tracked"
+ if num_batches_tracked_key in state_dict:
+ del state_dict[num_batches_tracked_key]
+
+ super()._load_from_state_dict(
+ state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+ )
+
+ def forward(self, x):
+        # move reshapes to the beginning
+        # to make it fuser-friendly
+ weight = self.weight.reshape(1, -1, 1, 1)
+ bias = self.bias.reshape(1, -1, 1, 1)
+ running_var = self.running_var.reshape(1, -1, 1, 1)
+ running_mean = self.running_mean.reshape(1, -1, 1, 1)
+ epsilon = 1e-5
+ scale = weight * (running_var + epsilon).rsqrt()
+ bias = bias - running_mean * scale
+ return x * scale + bias
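+
+# Illustrative sketch: with matching statistics, the frozen module reproduces
+# `nn.BatchNorm2d` in eval mode (both use eps=1e-5):
+#
+#     bn = nn.BatchNorm2d(4).eval()
+#     frozen = DetaFrozenBatchNorm2d(4)
+#     frozen.load_state_dict(bn.state_dict(), strict=False)
+#     x = torch.randn(2, 4, 8, 8)
+#     torch.testing.assert_close(frozen(x), bn(x))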
+
+
+def replace_batch_norm(model):
+ r"""
+ Recursively replace all `torch.nn.BatchNorm2d` with `DetaFrozenBatchNorm2d`.
+
+ Args:
+ model (torch.nn.Module):
+ input model
+ """
+ for name, module in model.named_children():
+ if isinstance(module, nn.BatchNorm2d):
+ new_module = DetaFrozenBatchNorm2d(module.num_features)
+
+ if module.weight.device != torch.device("meta"):
+ new_module.weight.data.copy_(module.weight)
+ new_module.bias.data.copy_(module.bias)
+ new_module.running_mean.data.copy_(module.running_mean)
+ new_module.running_var.data.copy_(module.running_var)
+
+ model._modules[name] = new_module
+
+ if len(list(module.children())) > 0:
+ replace_batch_norm(module)
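+
+# Illustrative sketch of the recursive replacement:
+#
+#     net = nn.Sequential(nn.Conv2d(3, 8, kernel_size=3), nn.BatchNorm2d(8))
+#     replace_batch_norm(net)
+#     assert isinstance(net[1], DetaFrozenBatchNorm2d)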
+
+
+class DetaBackboneWithPositionalEncodings(nn.Module):
+ """
+ Backbone model with positional embeddings.
+
+ nn.BatchNorm2d layers are replaced by DetaFrozenBatchNorm2d as defined above.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+
+ backbone = load_backbone(config)
+ with torch.no_grad():
+ replace_batch_norm(backbone)
+ self.model = backbone
+ self.intermediate_channel_sizes = self.model.channels
+
+ # TODO fix this
+ if config.backbone_config.model_type == "resnet":
+ for name, parameter in self.model.named_parameters():
+ if "stages.1" not in name and "stages.2" not in name and "stages.3" not in name:
+ parameter.requires_grad_(False)
+
+ self.position_embedding = build_position_encoding(config)
+
+ def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+ """
+        Outputs feature maps of later stages C_3 through C_5 in ResNet if `config.num_feature_levels > 1`, otherwise
+ outputs feature maps of C_5.
+ """
+ # first, send pixel_values through the backbone to get list of feature maps
+ features = self.model(pixel_values).feature_maps
+
+ # next, create position embeddings
+ out = []
+ pos = []
+ for feature_map in features:
+ # downsample pixel_mask to match shape of corresponding feature_map
+ mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+ position_embeddings = self.position_embedding(feature_map, mask).to(feature_map.dtype)
+ out.append((feature_map, mask))
+ pos.append(position_embeddings)
+
+ return out, pos
+
+
+class DetaSinePositionEmbedding(nn.Module):
+ """
+ This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
+ need paper, generalized to work on images.
+ """
+
+ def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
+ super().__init__()
+ self.embedding_dim = embedding_dim
+ self.temperature = temperature
+ self.normalize = normalize
+ if scale is not None and normalize is False:
+ raise ValueError("normalize should be True if scale is passed")
+ if scale is None:
+ scale = 2 * math.pi
+ self.scale = scale
+
+ def forward(self, pixel_values, pixel_mask):
+ if pixel_mask is None:
+ raise ValueError("No pixel mask provided")
+ y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
+ x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
+ if self.normalize:
+ eps = 1e-6
+ y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
+ x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
+
+ dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
+ dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
+
+ pos_x = x_embed[:, :, :, None] / dim_t
+ pos_y = y_embed[:, :, :, None] / dim_t
+ pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+ pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+ return pos
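+
+# Illustrative sketch: the sine embedding interleaves sin/cos per axis and
+# concatenates y- and x-channels, so `embedding_dim=64` yields 128 channels:
+#
+#     pe = DetaSinePositionEmbedding(embedding_dim=64, normalize=True)
+#     mask = torch.ones(2, 32, 32, dtype=torch.bool)
+#     pe(torch.randn(2, 3, 32, 32), mask).shape  # torch.Size([2, 128, 32, 32])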
+
+
+class DetaLearnedPositionEmbedding(nn.Module):
+ """
+ This module learns positional embeddings up to a fixed maximum size.
+ """
+
+ def __init__(self, embedding_dim=256):
+ super().__init__()
+ self.row_embeddings = nn.Embedding(50, embedding_dim)
+ self.column_embeddings = nn.Embedding(50, embedding_dim)
+
+ def forward(self, pixel_values, pixel_mask=None):
+ height, width = pixel_values.shape[-2:]
+ width_values = torch.arange(width, device=pixel_values.device)
+ height_values = torch.arange(height, device=pixel_values.device)
+ x_emb = self.column_embeddings(width_values)
+ y_emb = self.row_embeddings(height_values)
+ pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
+ pos = pos.permute(2, 0, 1)
+ pos = pos.unsqueeze(0)
+ pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
+ return pos
+
+
+def build_position_encoding(config):
+ n_steps = config.d_model // 2
+ if config.position_embedding_type == "sine":
+ # TODO find a better way of exposing other arguments
+ position_embedding = DetaSinePositionEmbedding(n_steps, normalize=True)
+ elif config.position_embedding_type == "learned":
+ position_embedding = DetaLearnedPositionEmbedding(n_steps)
+ else:
+        raise ValueError(f"Position embedding type {config.position_embedding_type} is not supported")
+
+ return position_embedding
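+
+# Illustrative sketch: `n_steps = d_model // 2` per axis gives exactly d_model
+# output channels for both variants, e.g. with the learned embedding:
+#
+#     pe = DetaLearnedPositionEmbedding(embedding_dim=128)
+#     pe(torch.randn(2, 3, 20, 20)).shape  # torch.Size([2, 256, 20, 20])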
+
+
+def multi_scale_deformable_attention(
+ value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
+) -> Tensor:
+ batch_size, _, num_heads, hidden_dim = value.shape
+ _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+ value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
+ sampling_grids = 2 * sampling_locations - 1
+ sampling_value_list = []
+ for level_id, (height, width) in enumerate(value_spatial_shapes):
+ # batch_size, height*width, num_heads, hidden_dim
+ # -> batch_size, height*width, num_heads*hidden_dim
+ # -> batch_size, num_heads*hidden_dim, height*width
+ # -> batch_size*num_heads, hidden_dim, height, width
+ value_l_ = (
+ value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
+ )
+ # batch_size, num_queries, num_heads, num_points, 2
+ # -> batch_size, num_heads, num_queries, num_points, 2
+ # -> batch_size*num_heads, num_queries, num_points, 2
+ sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+ # batch_size*num_heads, hidden_dim, num_queries, num_points
+ sampling_value_l_ = nn.functional.grid_sample(
+ value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+ )
+ sampling_value_list.append(sampling_value_l_)
+ # (batch_size, num_queries, num_heads, num_levels, num_points)
+ # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+ # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
+ attention_weights = attention_weights.transpose(1, 2).reshape(
+ batch_size * num_heads, 1, num_queries, num_levels * num_points
+ )
+ output = (
+ (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+ .sum(-1)
+ .view(batch_size, num_heads * hidden_dim, num_queries)
+ )
+ return output.transpose(1, 2).contiguous()
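+
+# Illustrative sketch of the shape contract of the pure-PyTorch fallback, for
+# two feature levels of 8x8 and 4x4 (all names below are local to the sketch):
+#
+#     bs, heads, dim, queries, points = 2, 8, 32, 10, 4
+#     shapes = torch.tensor([[8, 8], [4, 4]])
+#     value = torch.randn(bs, 8 * 8 + 4 * 4, heads, dim)
+#     locations = torch.rand(bs, queries, heads, 2, points, 2)
+#     weights = torch.softmax(torch.randn(bs, queries, heads, 2 * points), -1)
+#     weights = weights.view(bs, queries, heads, 2, points)
+#     out = multi_scale_deformable_attention(value, shapes, locations, weights)
+#     out.shape  # torch.Size([2, 10, 256]), i.e. (bs, queries, heads * dim)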
+
+
+class DetaMultiscaleDeformableAttention(nn.Module):
+ """
+ Multiscale deformable attention as proposed in Deformable DETR.
+ """
+
+ def __init__(self, config: DetaConfig, num_heads: int, n_points: int):
+ super().__init__()
+
+ kernel_loaded = MultiScaleDeformableAttention is not None
+ if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
+ try:
+ load_cuda_kernels()
+ except Exception as e:
+ logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
+
+ if config.d_model % num_heads != 0:
+ raise ValueError(
+ f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+ )
+ dim_per_head = config.d_model // num_heads
+ # check if dim_per_head is power of 2
+ if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+            warnings.warn(
+                "It is recommended to set embed_dim (d_model) in DetaMultiscaleDeformableAttention so that the"
+                " dimension of each attention head is a power of 2, which is more efficient in the authors' CUDA"
+                " implementation."
+            )
+
+ self.im2col_step = 64
+
+ self.d_model = config.d_model
+ self.n_levels = config.num_feature_levels
+ self.n_heads = num_heads
+ self.n_points = n_points
+
+ self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
+ self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
+ self.value_proj = nn.Linear(config.d_model, config.d_model)
+ self.output_proj = nn.Linear(config.d_model, config.d_model)
+
+ self.disable_custom_kernels = config.disable_custom_kernels
+
+ self._reset_parameters()
+
+ def _reset_parameters(self):
+ nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
+ default_dtype = torch.get_default_dtype()
+ thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads)
+ grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+ grid_init = (
+ (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+ .view(self.n_heads, 1, 1, 2)
+ .repeat(1, self.n_levels, self.n_points, 1)
+ )
+ for i in range(self.n_points):
+ grid_init[:, :, i, :] *= i + 1
+ with torch.no_grad():
+ self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+ nn.init.constant_(self.attention_weights.weight.data, 0.0)
+ nn.init.constant_(self.attention_weights.bias.data, 0.0)
+ nn.init.xavier_uniform_(self.value_proj.weight.data)
+ nn.init.constant_(self.value_proj.bias.data, 0.0)
+ nn.init.xavier_uniform_(self.output_proj.weight.data)
+ nn.init.constant_(self.output_proj.bias.data, 0.0)
+
+ def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+ return tensor if position_embeddings is None else tensor + position_embeddings
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ level_start_index=None,
+ output_attentions: bool = False,
+ ):
+ # add position embeddings to the hidden states before projecting to queries and keys
+ if position_embeddings is not None:
+ hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+ batch_size, num_queries, _ = hidden_states.shape
+ batch_size, sequence_length, _ = encoder_hidden_states.shape
+ if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+ raise ValueError(
+ "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+ )
+
+ value = self.value_proj(encoder_hidden_states)
+ if attention_mask is not None:
+ # we invert the attention_mask
+ value = value.masked_fill(~attention_mask[..., None], float(0))
+ value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+ sampling_offsets = self.sampling_offsets(hidden_states).view(
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
+ )
+ attention_weights = self.attention_weights(hidden_states).view(
+ batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
+ )
+ attention_weights = F.softmax(attention_weights, -1).view(
+ batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
+ )
+ # batch_size, num_queries, n_heads, n_levels, n_points, 2
+ num_coordinates = reference_points.shape[-1]
+ if num_coordinates == 2:
+ offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+ sampling_locations = (
+ reference_points[:, :, None, :, None, :]
+ + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+ )
+ elif num_coordinates == 4:
+ sampling_locations = (
+ reference_points[:, :, None, :, None, :2]
+ + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+ )
+ else:
+ raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+
+ if self.disable_custom_kernels:
+ # PyTorch implementation
+ output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+ else:
+ try:
+ # custom kernel
+ output = MultiScaleDeformableAttentionFunction.apply(
+ value,
+ spatial_shapes,
+ level_start_index,
+ sampling_locations,
+ attention_weights,
+ self.im2col_step,
+ )
+ except Exception:
+ # PyTorch implementation
+ output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+ output = self.output_proj(output)
+
+ return output, attention_weights
+
+
+class DetaMultiheadAttention(nn.Module):
+ """
+ Multi-headed attention from 'Attention Is All You Need' paper.
+
+ Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
+ """
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ bias: bool = True,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ if self.head_dim * num_heads != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+ f" {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+ return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+ return tensor if position_embeddings is None else tensor + position_embeddings
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_embeddings: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """Input shape: Batch x Time x Channel"""
+
+ batch_size, target_len, embed_dim = hidden_states.size()
+        # add position embeddings to the hidden states before projecting to queries and keys
+        hidden_states_original = hidden_states
+        if position_embeddings is not None:
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+ # get queries, keys and values
+ query_states = self.q_proj(hidden_states) * self.scaling
+ key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
+ value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
+
+ proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
+ key_states = key_states.view(*proj_shape)
+ value_states = value_states.view(*proj_shape)
+
+ source_len = key_states.size(1)
+
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+ raise ValueError(
+ f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ # expand attention_mask
+ if attention_mask is not None:
+ # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+ attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+
+ if attention_mask is not None:
+ if attention_mask.size() != (batch_size, 1, target_len, source_len):
+ raise ValueError(
+ f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+ f" {attention_mask.size()}"
+ )
+ if attention_mask.dtype == torch.bool:
+ attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_(
+ attention_mask, -torch.inf
+ )
+ attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+ attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+ attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+ raise ValueError(
+                f"`attn_output` should be of size {(batch_size * self.num_heads, target_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+ attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped
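+
+# Illustrative usage sketch: position embeddings are added to queries and keys
+# only, while values are projected from the original hidden states:
+#
+#     attn = DetaMultiheadAttention(embed_dim=256, num_heads=8)
+#     hidden = torch.randn(2, 100, 256)
+#     pos = torch.randn(2, 100, 256)
+#     out, weights = attn(hidden, position_embeddings=pos, output_attentions=True)
+#     out.shape, weights.shape  # (2, 100, 256) and (2, 8, 100, 100)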
+
+
+class DetaEncoderLayer(nn.Module):
+ def __init__(self, config: DetaConfig):
+ super().__init__()
+ self.embed_dim = config.d_model
+ self.self_attn = DetaMultiscaleDeformableAttention(
+ config,
+ num_heads=config.encoder_attention_heads,
+ n_points=config.encoder_n_points,
+ )
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+ self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+ self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: torch.Tensor,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ level_start_index=None,
+ output_attentions: bool = False,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Input to the layer.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+ Attention mask.
+ position_embeddings (`torch.FloatTensor`, *optional*):
+ Position embeddings, to be added to `hidden_states`.
+ reference_points (`torch.FloatTensor`, *optional*):
+ Reference points.
+ spatial_shapes (`torch.LongTensor`, *optional*):
+ Spatial shapes of the backbone feature maps.
+ level_start_index (`torch.LongTensor`, *optional*):
+ Level start index.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps.
+ hidden_states, attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=hidden_states,
+ encoder_attention_mask=attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ if self.training:
+ if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+ clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+ hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class DetaDecoderLayer(GradientCheckpointingLayer):
+ def __init__(self, config: DetaConfig):
+ super().__init__()
+ self.embed_dim = config.d_model
+
+ # self-attention
+ self.self_attn = DetaMultiheadAttention(
+ embed_dim=self.embed_dim,
+ num_heads=config.decoder_attention_heads,
+ dropout=config.attention_dropout,
+ )
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ # cross-attention
+ self.encoder_attn = DetaMultiscaleDeformableAttention(
+ config,
+ num_heads=config.decoder_attention_heads,
+ n_points=config.decoder_n_points,
+ )
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+ # feedforward neural networks
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ position_embeddings: Optional[torch.Tensor] = None,
+ reference_points=None,
+ spatial_shapes=None,
+ level_start_index=None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = False,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`):
+ Input to the layer of shape `(batch, seq_len, embed_dim)`.
+ position_embeddings (`torch.FloatTensor`, *optional*):
+ Position embeddings that are added to the queries and keys in the self-attention layer.
+ reference_points (`torch.FloatTensor`, *optional*):
+ Reference points.
+ spatial_shapes (`torch.LongTensor`, *optional*):
+ Spatial shapes.
+ level_start_index (`torch.LongTensor`, *optional*):
+ Level start index.
+ encoder_hidden_states (`torch.FloatTensor`):
+ cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+ values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Self Attention
+ hidden_states, self_attn_weights = self.self_attn(
+ hidden_states=hidden_states,
+ position_embeddings=position_embeddings,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ second_residual = hidden_states
+
+ # Cross-Attention
+ cross_attn_weights = None
+ hidden_states, cross_attn_weights = self.encoder_attn(
+ hidden_states=hidden_states,
+ attention_mask=encoder_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = second_residual + hidden_states
+
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights, cross_attn_weights)
+
+ return outputs
+
+
+class DetaPreTrainedModel(PreTrainedModel):
+ config: DetaConfig
+ base_model_prefix = "model"
+ main_input_name = "pixel_values"
+ _no_split_modules = [r"DetaBackboneWithPositionalEncodings", r"DetaEncoderLayer", r"DetaDecoderLayer"]
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ std = self.config.init_std
+
+ if isinstance(module, DetaLearnedPositionEmbedding):
+ nn.init.uniform_(module.row_embeddings.weight)
+ nn.init.uniform_(module.column_embeddings.weight)
+ elif isinstance(module, DetaMultiscaleDeformableAttention):
+ module._reset_parameters()
+ elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ if hasattr(module, "reference_points") and not self.config.two_stage:
+ nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
+ nn.init.constant_(module.reference_points.bias.data, 0.0)
+ if hasattr(module, "level_embed"):
+ nn.init.normal_(module.level_embed)
+
+
+DETA_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`DetaConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DETA_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Padding will be ignored by default should you provide it.
+
+ Pixel values can be obtained using [`AutoImageProcessor`]. See [`AutoImageProcessor.__call__`] for details.
+
+ pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+ Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
+
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+ Not used by default. Can be used to mask object queries.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+ can choose to directly pass a flattened representation of an image.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+ embedded representation.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class DetaEncoder(DetaPreTrainedModel):
+ """
+ Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
+ [`DetaEncoderLayer`].
+
+ The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
+
+ Args:
+ config: DetaConfig
+ """
+
+ def __init__(self, config: DetaConfig):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.layers = nn.ModuleList([DetaEncoderLayer(config) for _ in range(config.encoder_layers)])
+ self.gradient_checkpointing = False
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @staticmethod
+ def get_reference_points(spatial_shapes, valid_ratios, device):
+ """
+        Get reference points for each feature map. Used in the encoder.
+
+ Args:
+ spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of each feature map.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+ Valid ratios of each feature map.
+ device (`torch.device`):
+ Device on which to create the tensors.
+ Returns:
+ `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
+ """
+ reference_points_list = []
+ for level, (height, width) in enumerate(spatial_shapes):
+ ref_y, ref_x = meshgrid(
+ torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
+ torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
+ indexing="ij",
+ )
+ # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36
+ ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
+ ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
+ ref = torch.stack((ref_x, ref_y), -1)
+ reference_points_list.append(ref)
+ reference_points = torch.cat(reference_points_list, 1)
+ reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+ return reference_points
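+
+    # Illustrative sketch: for two levels of 2x2 and 1x1 with fully valid
+    # feature maps, the grid stacks sum(H*W) = 5 points per batch element:
+    #
+    #     shapes = torch.tensor([[2, 2], [1, 1]])
+    #     ratios = torch.ones(1, 2, 2)
+    #     DetaEncoder.get_reference_points(shapes, ratios, "cpu").shape
+    #     # torch.Size([1, 5, 2, 2])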
+
+ def forward(
+ self,
+ inputs_embeds=None,
+ attention_mask=None,
+ position_embeddings=None,
+ spatial_shapes=None,
+ level_start_index=None,
+ valid_ratios=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+ - 1 for pixel features that are real (i.e. **not masked**),
+ - 0 for pixel features that are padding (i.e. **masked**).
+ [What are attention masks?](../glossary#attention-mask)
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+ spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of each feature map.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
+ Starting index of each feature map.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+ Ratio of valid area in each feature level.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ hidden_states = inputs_embeds
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+ for i, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask,
+ position_embeddings=position_embeddings,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+class DetaDecoder(DetaPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetaDecoderLayer`].
+
+ The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+ Some tweaks for Deformable DETR:
+
+ - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
+ - it also returns a stack of intermediate outputs and reference points from all decoding layers.
+
+ Args:
+ config: DetaConfig
+ """
+
+ def __init__(self, config: DetaConfig):
+ super().__init__(config)
+
+ self.dropout = config.dropout
+ self.layers = nn.ModuleList([DetaDecoderLayer(config) for _ in range(config.decoder_layers)])
+ self.gradient_checkpointing = False
+
+ # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+ self.bbox_embed = None
+ self.class_embed = None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ inputs_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ position_embeddings=None,
+ reference_points=None,
+ spatial_shapes=None,
+ level_start_index=None,
+ valid_ratios=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+ The query embeddings that are passed into the decoder.
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+ in `[0, 1]`:
+ - 1 for pixels that are real (i.e. **not masked**),
+ - 0 for pixels that are padding (i.e. **masked**).
+ position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+ Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` if `as_two_stage` else `(batch_size, num_queries, 2)`, *optional*):
+ Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+ spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+ Spatial shapes of the feature maps.
+ level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+ Indexes for the start of each feature level. In range `[0, sequence_length]`.
+ valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+ Ratio of valid area in each feature level.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if inputs_embeds is not None:
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ intermediate = ()
+ intermediate_reference_points = ()
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if reference_points.shape[-1] == 4:
+ reference_points_input = (
+ reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+ )
+ else:
+ if reference_points.shape[-1] != 2:
+ raise ValueError("Reference points' last dimension must be of size 2")
+ reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
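+ # Shape sketch (illustrative): `reference_points` is (batch_size, num_queries, 2 or 4) and
+ # `valid_ratios` is (batch_size, num_feature_levels, 2), so after broadcasting,
+ # `reference_points_input` is (batch_size, num_queries, num_feature_levels, 2 or 4), i.e. one
+ # reference point per feature level, rescaled to that level's valid (non-padded) area.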
+
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ layer_outputs = decoder_layer(
+ hidden_states,
+ position_embeddings=position_embeddings,
+ encoder_hidden_states=encoder_hidden_states,
+ reference_points=reference_points_input,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ # hack implementation for iterative bounding box refinement
+ if self.bbox_embed is not None:
+ tmp = self.bbox_embed[idx](hidden_states)
+ if reference_points.shape[-1] == 4:
+ new_reference_points = tmp + inverse_sigmoid(reference_points)
+ new_reference_points = new_reference_points.sigmoid()
+ else:
+ if reference_points.shape[-1] != 2:
+ raise ValueError(
+ f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}"
+ )
+ new_reference_points = tmp
+ new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
+ new_reference_points = new_reference_points.sigmoid()
+ reference_points = new_reference_points.detach()
+
+ intermediate += (hidden_states,)
+ intermediate_reference_points += (reference_points,)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if encoder_hidden_states is not None:
+ all_cross_attentions += (layer_outputs[2],)
+
+ # Keep batch_size as first dimension
+ intermediate = torch.stack(intermediate, dim=1)
+ intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ intermediate,
+ intermediate_reference_points,
+ all_hidden_states,
+ all_self_attns,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return DetaDecoderOutput(
+ last_hidden_state=hidden_states,
+ intermediate_hidden_states=intermediate,
+ intermediate_reference_points=intermediate_reference_points,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
+ any specific head on top.
+ """,
+ DETA_START_DOCSTRING,
+)
+class DetaModel(DetaPreTrainedModel):
+ def __init__(self, config: DetaConfig):
+ super().__init__(config)
+
+ if config.two_stage:
+ requires_backends(self, ["torchvision"])
+
+ # Create backbone with positional encoding
+ self.backbone = DetaBackboneWithPositionalEncodings(config)
+ intermediate_channel_sizes = self.backbone.intermediate_channel_sizes
+
+ # Create input projection layers
+ if config.num_feature_levels > 1:
+ num_backbone_outs = len(intermediate_channel_sizes)
+ input_proj_list = []
+ for level in range(num_backbone_outs):
+ in_channels = intermediate_channel_sizes[level]
+ input_proj_list.append(
+ nn.Sequential(
+ nn.Conv2d(in_channels, config.d_model, kernel_size=1),
+ nn.GroupNorm(32, config.d_model),
+ )
+ )
+ for _ in range(config.num_feature_levels - num_backbone_outs):
+ input_proj_list.append(
+ nn.Sequential(
+ nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1),
+ nn.GroupNorm(32, config.d_model),
+ )
+ )
+ in_channels = config.d_model
+ self.input_proj = nn.ModuleList(input_proj_list)
+ else:
+ self.input_proj = nn.ModuleList(
+ [
+ nn.Sequential(
+ nn.Conv2d(intermediate_channel_sizes[-1], config.d_model, kernel_size=1),
+ nn.GroupNorm(32, config.d_model),
+ )
+ ]
+ )
+
+ if not config.two_stage:
+ self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)
+
+ self.encoder = DetaEncoder(config)
+ self.decoder = DetaDecoder(config)
+
+ self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))
+
+ if config.two_stage:
+ self.enc_output = nn.Linear(config.d_model, config.d_model)
+ self.enc_output_norm = nn.LayerNorm(config.d_model)
+ self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
+ self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
+ self.pix_trans = nn.Linear(config.d_model, config.d_model)
+ self.pix_trans_norm = nn.LayerNorm(config.d_model)
+ else:
+ self.reference_points = nn.Linear(config.d_model, 2)
+
+ self.assign_first_stage = config.assign_first_stage
+ self.two_stage_num_proposals = config.two_stage_num_proposals
+
+ self.post_init()
+
+ def get_encoder(self):
+ return self.encoder
+
+ def freeze_backbone(self):
+ for name, param in self.backbone.model.named_parameters():
+ param.requires_grad_(False)
+
+ def unfreeze_backbone(self):
+ for name, param in self.backbone.model.named_parameters():
+ param.requires_grad_(True)
+
+ def get_valid_ratio(self, mask, dtype=torch.float32):
+ """Get the valid ratio of all feature maps."""
+
+ _, height, width = mask.shape
+ valid_height = torch.sum(mask[:, :, 0], 1)
+ valid_width = torch.sum(mask[:, 0, :], 1)
+ valid_ratio_height = valid_height.to(dtype) / height
+ valid_ratio_width = valid_width.to(dtype) / width
+ valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
+ return valid_ratio
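+
+ # Illustrative sketch (hedged), assuming the mask marks valid pixels with 1: for a 4x6 mask whose
+ # last row and last two columns are padding, valid_height = 3 and valid_width = 4, so this method
+ # returns [4/6, 3/4] per image, the fraction of each axis that carries real content.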
+
+ def get_proposal_pos_embed(self, proposals):
+ """Get the position embedding of the proposals."""
+
+ num_pos_feats = self.config.d_model // 2
+ temperature = 10000
+ scale = 2 * math.pi
+
+ dim_t = torch.arange(num_pos_feats, dtype=torch.int64, device=proposals.device).float()
+ dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+ # batch_size, num_queries, 4
+ proposals = proposals.sigmoid() * scale
+ # batch_size, num_queries, 4, 128
+ pos = proposals[:, :, :, None] / dim_t
+ # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512
+ pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
+ return pos
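+
+ # Illustrative shape walk-through (assuming config.d_model == 256, so num_pos_feats == 128):
+ # proposals: (batch_size, num_queries, 4) box logits; after sigmoid * 2pi, values lie in [0, 2pi];
+ # dividing by dim_t gives (batch_size, num_queries, 4, 128); interleaving sin/cos and flattening
+ # yields (batch_size, num_queries, 512), matching the `config.d_model * 2` input of `self.pos_trans`.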
+
+ def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+ """Generate the encoder output proposals from encoded enc_output.
+
+ Args:
+ enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
+ padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
+ spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps.
+
+ Returns:
+ `tuple(torch.FloatTensor)`: A tuple of object query features, normalized proposals and level indices.
+ - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features, later used to
+ directly predict a bounding box (without the need of a decoder).
+ - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
+ sigmoid.
+ - level_ids (Tensor[sequence_length]): Index of the feature level each sequence position belongs to.
+ """
+ batch_size = enc_output.shape[0]
+ proposals = []
+ _cur = 0
+ level_ids = []
+ for level, (height, width) in enumerate(spatial_shapes):
+ mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
+ valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+ valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+ grid_y, grid_x = meshgrid(
+ torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device),
+ torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device),
+ indexing="ij",
+ )
+ grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+ scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
+ grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
+ width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
+ proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
+ proposals.append(proposal)
+ _cur += height * width
+ level_ids.append(grid.new_ones(height * width, dtype=torch.long) * level)
+ output_proposals = torch.cat(proposals, 1)
+ output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+ output_proposals = torch.log(output_proposals / (1 - output_proposals)) # inverse sigmoid
+ output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
+ output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
+
+ # assign each pixel as an object query
+ object_query = enc_output
+ object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
+ object_query = object_query.masked_fill(~output_proposals_valid, float(0))
+ object_query = self.enc_output_norm(self.enc_output(object_query))
+ level_ids = torch.cat(level_ids)
+ return object_query, output_proposals, level_ids
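+
+ # Illustrative sketch: with spatial_shapes [[64, 64], [32, 32]], level 0 contributes 4096 proposals
+ # of width/height 0.05 and level 1 contributes 1024 proposals of width/height 0.1 (0.05 * 2**level),
+ # each centered on its grid cell; proposals falling outside (0.01, 0.99) or on padding are masked to
+ # +inf and their object queries are zeroed out.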
+
+ @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=DetaModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.FloatTensor], DetaModelOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, DetaModel
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large-o365")
+ >>> model = DetaModel.from_pretrained("jozhang97/deta-swin-large-o365", two_stage=False)
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ >>> list(last_hidden_states.shape)
+ [1, 900, 256]
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ batch_size, num_channels, height, width = pixel_values.shape
+ device = pixel_values.device
+
+ if pixel_mask is None:
+ pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
+
+ # Extract multi-scale feature maps, each later projected to `config.d_model` channels (cf Figure 4 in paper)
+ # First, send pixel_values + pixel_mask through the backbone to obtain the features,
+ # which is a list of (feature_map, mask) tuples
+ features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+ # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+ sources = []
+ masks = []
+ for level, (source, mask) in enumerate(features):
+ sources.append(self.input_proj[level](source))
+ masks.append(mask)
+ if mask is None:
+ raise ValueError("No attention mask was provided")
+
+ # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+ if self.config.num_feature_levels > len(sources):
+ _len_sources = len(sources)
+ for level in range(_len_sources, self.config.num_feature_levels):
+ if level == _len_sources:
+ source = self.input_proj[level](features[-1][0])
+ else:
+ source = self.input_proj[level](sources[-1])
+ mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0]
+ pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
+ sources.append(source)
+ masks.append(mask)
+ position_embeddings_list.append(pos_l)
+
+ # Create queries
+ query_embeds = None
+ if not self.config.two_stage:
+ query_embeds = self.query_position_embeddings.weight
+
+ # Prepare encoder inputs (by flattening)
+ spatial_shapes = [(source.shape[2:]) for source in sources]
+ source_flatten = [source.flatten(2).transpose(1, 2) for source in sources]
+ mask_flatten = [mask.flatten(1) for mask in masks]
+
+ lvl_pos_embed_flatten = []
+ for level, pos_embed in enumerate(position_embeddings_list):
+ pos_embed = pos_embed.flatten(2).transpose(1, 2)
+ lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
+ lvl_pos_embed_flatten.append(lvl_pos_embed)
+
+ source_flatten = torch.cat(source_flatten, 1)
+ mask_flatten = torch.cat(mask_flatten, 1)
+ lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+ spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device)
+ level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
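+ # Illustrative: for spatial_shapes [[64, 64], [32, 32]], spatial_shapes.prod(1) is [4096, 1024], so
+ # level_start_index becomes [0, 4096], the offset at which each flattened level starts in the sequence.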
+ valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+ valid_ratios = valid_ratios.float()
+
+ # Fourth, send source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder
+ # Also provide spatial_shapes, level_start_index and valid_ratios
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ inputs_embeds=source_flatten,
+ attention_mask=mask_flatten,
+ position_embeddings=lvl_pos_embed_flatten,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ valid_ratios=valid_ratios,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+ elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+ encoder_outputs = BaseModelOutput(
+ last_hidden_state=encoder_outputs[0],
+ hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+ attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+ )
+
+ # Fifth, prepare decoder inputs
+ batch_size, _, num_channels = encoder_outputs[0].shape
+ enc_outputs_class = None
+ enc_outputs_coord_logits = None
+ output_proposals = None
+ if self.config.two_stage:
+ object_query_embedding, output_proposals, level_ids = self.gen_encoder_output_proposals(
+ encoder_outputs[0], ~mask_flatten, spatial_shapes
+ )
+
+ # hack implementation for two-stage DETA
+ # apply a detection head to each pixel (A.4 in paper)
+ # linear projection for bounding box binary classification (i.e. foreground and background)
+ enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding)
+ # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
+ delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding)
+ enc_outputs_coord_logits = delta_bbox + output_proposals
+
+ # only keep top scoring `config.two_stage_num_proposals` proposals
+ topk = self.two_stage_num_proposals
+ proposal_logit = enc_outputs_class[..., 0]
+
+ if self.assign_first_stage:
+ proposal_boxes = center_to_corners_format(enc_outputs_coord_logits.sigmoid().float()).clamp(0, 1)
+ topk_proposals = []
+ for b in range(batch_size):
+ prop_boxes_b = proposal_boxes[b]
+ prop_logits_b = proposal_logit[b]
+
+ # pre-nms per-level topk
+ pre_nms_topk = 1000
+ pre_nms_inds = []
+ for lvl in range(len(spatial_shapes)):
+ lvl_mask = level_ids == lvl
+ pre_nms_inds.append(torch.topk(prop_logits_b.sigmoid() * lvl_mask, pre_nms_topk)[1])
+ pre_nms_inds = torch.cat(pre_nms_inds)
+
+ # nms on topk indices
+ post_nms_inds = batched_nms(
+ prop_boxes_b[pre_nms_inds], prop_logits_b[pre_nms_inds], level_ids[pre_nms_inds], 0.9
+ )
+ keep_inds = pre_nms_inds[post_nms_inds]
+
+ if len(keep_inds) < self.two_stage_num_proposals:
+ print(
+ f"[WARNING] nms proposals ({len(keep_inds)}) < {self.two_stage_num_proposals}, running"
+ " naive topk"
+ )
+ keep_inds = torch.topk(proposal_logit[b], topk)[1]
+
+ # keep top Q/L indices for L levels
+ q_per_l = topk // len(spatial_shapes)
+ is_level_ordered = (
+ level_ids[keep_inds][None]
+ == torch.arange(len(spatial_shapes), device=level_ids.device)[:, None]
+ )
+ keep_inds_mask = is_level_ordered & (is_level_ordered.cumsum(1) <= q_per_l) # LS
+ keep_inds_mask = keep_inds_mask.any(0) # S
+
+ # pad to Q indices (might let ones filtered from pre-nms sneak by... unlikely because we pick high conf anyways)
+ if keep_inds_mask.sum() < topk:
+ num_to_add = topk - keep_inds_mask.sum()
+ pad_inds = (~keep_inds_mask).nonzero()[:num_to_add]
+ keep_inds_mask[pad_inds] = True
+
+ keep_inds_topk = keep_inds[keep_inds_mask]
+ topk_proposals.append(keep_inds_topk)
+ topk_proposals = torch.stack(topk_proposals)
+ else:
+ topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
+
+ topk_coords_logits = torch.gather(
+ enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+ )
+ topk_coords_logits = topk_coords_logits.detach()
+ reference_points = topk_coords_logits.sigmoid()
+ init_reference_points = reference_points
+ pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits)))
+ query_embed, target = torch.split(pos_trans_out, num_channels, dim=2)
+
+ topk_feats = torch.stack(
+ [object_query_embedding[b][topk_proposals[b]] for b in range(batch_size)]
+ ).detach()
+ target = target + self.pix_trans_norm(self.pix_trans(topk_feats))
+ else:
+ query_embed, target = torch.split(query_embeds, num_channels, dim=1)
+ query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1)
+ target = target.unsqueeze(0).expand(batch_size, -1, -1)
+ reference_points = self.reference_points(query_embed).sigmoid()
+ init_reference_points = reference_points
+
+ decoder_outputs = self.decoder(
+ inputs_embeds=target,
+ position_embeddings=query_embed,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=mask_flatten,
+ reference_points=reference_points,
+ spatial_shapes=spatial_shapes,
+ level_start_index=level_start_index,
+ valid_ratios=valid_ratios,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None)
+ tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs
+
+ return tuple_outputs
+
+ return DetaModelOutput(
+ init_reference_points=init_reference_points,
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+ intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_attentions=decoder_outputs.attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ enc_outputs_class=enc_outputs_class,
+ enc_outputs_coord_logits=enc_outputs_coord_logits,
+ output_proposals=output_proposals,
+ )
+
+
+@add_start_docstrings(
+ """
+ DETA Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
+ such as COCO detection.
+ """,
+ DETA_START_DOCSTRING,
+)
+class DetaForObjectDetection(DetaPreTrainedModel):
+ # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+ _tied_weights_keys = [r"bbox_embed\.\d+", r"class_embed\.\d+"]
+ # We can't initialize the model on meta device as some weights are modified during the initialization
+ _no_split_modules = None
+
+ def __init__(self, config: DetaConfig):
+ super().__init__(config)
+
+ # Deformable DETR encoder-decoder model
+ self.model = DetaModel(config)
+
+ # Detection heads on top
+ self.class_embed = nn.Linear(config.d_model, config.num_labels)
+ self.bbox_embed = DetaMLPPredictionHead(
+ input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+ )
+
+ prior_prob = 0.01
+ bias_value = -math.log((1 - prior_prob) / prior_prob)
+ self.class_embed.bias.data.fill_(bias_value)
+ nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
+ nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
+
+ # if two-stage, the last class_embed and bbox_embed is for region proposal generation
+ num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
+ if config.with_box_refine:
+ self.class_embed = _get_clones(self.class_embed, num_pred)
+ self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
+ nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
+ # hack implementation for iterative bounding box refinement
+ self.model.decoder.bbox_embed = self.bbox_embed
+ else:
+ nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
+ self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
+ self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
+ self.model.decoder.bbox_embed = None
+ if config.two_stage:
+ # hack implementation for two-stage
+ self.model.decoder.class_embed = self.class_embed
+ for box_embed in self.bbox_embed:
+ nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @torch.jit.unused
+ def _set_aux_loss(self, outputs_class, outputs_coord):
+ # this is a workaround to make torchscript happy, as torchscript
+ # doesn't support dictionary with non-homogeneous values, such
+ # as a dict having both a Tensor and a list.
+ aux_loss = [
+ {"logits": logits, "pred_boxes": pred_boxes}
+ for logits, pred_boxes in zip(outputs_class.transpose(0, 1)[:-1], outputs_coord.transpose(0, 1)[:-1])
+ ]
+ return aux_loss
+
+ @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=DetaObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.LongTensor] = None,
+ decoder_attention_mask: Optional[torch.FloatTensor] = None,
+ encoder_outputs: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[list[dict]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.FloatTensor], DetaObjectDetectionOutput]:
+ r"""
+ labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+ Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+ following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+ respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+ in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, DetaForObjectDetection
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large")
+ >>> model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large")
+
+ >>> inputs = image_processor(images=image, return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+ >>> target_sizes = torch.tensor([image.size[::-1]])
+ >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
+ ... 0
+ ... ]
+ >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+ ... box = [round(i, 2) for i in box.tolist()]
+ ... print(
+ ... f"Detected {model.config.id2label[label.item()]} with confidence "
+ ... f"{round(score.item(), 3)} at location {box}"
+ ... )
+ Detected cat with confidence 0.802 at location [9.87, 54.36, 316.93, 473.44]
+ Detected cat with confidence 0.795 at location [346.62, 24.35, 639.62, 373.2]
+ Detected remote with confidence 0.725 at location [40.41, 73.36, 175.77, 117.29]
+ Detected remote with confidence 0.638 at location [333.34, 76.81, 370.22, 187.94]
+ Detected couch with confidence 0.584 at location [0.03, 0.99, 640.02, 474.93]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # First, send images through the DETA base model to obtain encoder + decoder outputs
+ outputs = self.model(
+ pixel_values,
+ pixel_mask=pixel_mask,
+ decoder_attention_mask=decoder_attention_mask,
+ encoder_outputs=encoder_outputs,
+ inputs_embeds=inputs_embeds,
+ decoder_inputs_embeds=decoder_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
+ init_reference = outputs.init_reference_points if return_dict else outputs[0]
+ inter_references = outputs.intermediate_reference_points if return_dict else outputs[3]
+
+ # class logits + predicted bounding boxes
+ outputs_classes = []
+ outputs_coords = []
+
+ for level in range(hidden_states.shape[1]):
+ if level == 0:
+ reference = init_reference
+ else:
+ reference = inter_references[:, level - 1]
+ reference = inverse_sigmoid(reference)
+ outputs_class = self.class_embed[level](hidden_states[:, level])
+ delta_bbox = self.bbox_embed[level](hidden_states[:, level])
+ if reference.shape[-1] == 4:
+ outputs_coord_logits = delta_bbox + reference
+ elif reference.shape[-1] == 2:
+ delta_bbox[..., :2] += reference
+ outputs_coord_logits = delta_bbox
+ else:
+ raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
+ outputs_coord = outputs_coord_logits.sigmoid()
+ outputs_classes.append(outputs_class)
+ outputs_coords.append(outputs_coord)
+ # Keep batch_size as first dimension
+ outputs_class = torch.stack(outputs_classes, dim=1)
+ outputs_coord = torch.stack(outputs_coords, dim=1)
+
+ logits = outputs_class[:, -1]
+ pred_boxes = outputs_coord[:, -1]
+
+ loss, loss_dict, auxiliary_outputs = None, None, None
+ if labels is not None:
+ # First: create the matcher
+ matcher = DetaHungarianMatcher(
+ class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
+ )
+ # Second: create the criterion
+ losses = ["labels", "boxes", "cardinality"]
+ criterion = DetaLoss(
+ matcher=matcher,
+ num_classes=self.config.num_labels,
+ focal_alpha=self.config.focal_alpha,
+ losses=losses,
+ num_queries=self.config.num_queries,
+ assign_first_stage=self.config.assign_first_stage,
+ assign_second_stage=self.config.assign_second_stage,
+ )
+ criterion.to(logits.device)
+ # Third: compute the losses, based on outputs and labels
+ outputs_loss = {}
+ outputs_loss["logits"] = logits
+ outputs_loss["pred_boxes"] = pred_boxes
+ outputs_loss["init_reference"] = init_reference
+ if self.config.auxiliary_loss:
+ auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
+ outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+ if self.config.two_stage:
+ enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid()
+ outputs_loss["enc_outputs"] = {
+ "logits": outputs.enc_outputs_class,
+ "pred_boxes": enc_outputs_coord,
+ "anchors": outputs.output_proposals.sigmoid(),
+ }
+
+ loss_dict = criterion(outputs_loss, labels)
+ # Fourth: compute total loss, as a weighted sum of the various losses
+ weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
+ weight_dict["loss_giou"] = self.config.giou_loss_coefficient
+ if self.config.auxiliary_loss:
+ aux_weight_dict = {}
+ for i in range(self.config.decoder_layers - 1):
+ aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+ aux_weight_dict.update({k + "_enc": v for k, v in weight_dict.items()})
+ weight_dict.update(aux_weight_dict)
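+ # Illustrative: with 6 decoder layers this adds keys like "loss_ce_0" ... "loss_ce_4" and
+ # "loss_ce_enc" (likewise for bbox/giou), matching the "_{i}" and "_enc" suffixes that
+ # DetaLoss attaches to its auxiliary and encoder loss terms.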
+ loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
+
+ if not return_dict:
+ if auxiliary_outputs is not None:
+ output = (logits, pred_boxes) + auxiliary_outputs + outputs
+ else:
+ output = (logits, pred_boxes) + outputs
+ tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output
+
+ return tuple_outputs
+
+ dict_outputs = DetaObjectDetectionOutput(
+ loss=loss,
+ loss_dict=loss_dict,
+ logits=logits,
+ pred_boxes=pred_boxes,
+ auxiliary_outputs=auxiliary_outputs,
+ last_hidden_state=outputs.last_hidden_state,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ cross_attentions=outputs.cross_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ intermediate_hidden_states=outputs.intermediate_hidden_states,
+ intermediate_reference_points=outputs.intermediate_reference_points,
+ init_reference_points=outputs.init_reference_points,
+ enc_outputs_class=outputs.enc_outputs_class,
+ enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+ output_proposals=outputs.output_proposals,
+ )
+
+ return dict_outputs
+
+
+def dice_loss(inputs, targets, num_boxes):
+ """
+ Compute the DICE loss, similar to generalized IOU for masks.
+
+ Args:
+ inputs: A float tensor of arbitrary shape.
+ The predictions for each example.
+ targets: A float tensor with the same shape as inputs. Stores the binary
+ classification label for each element in inputs (0 for the negative class and 1 for the positive
+ class).
+ num_boxes: The number of boxes used to normalize the loss.
+ """
+ inputs = inputs.sigmoid()
+ inputs = inputs.flatten(1)
+ numerator = 2 * (inputs * targets).sum(1)
+ denominator = inputs.sum(-1) + targets.sum(-1)
+ loss = 1 - (numerator + 1) / (denominator + 1)
+ return loss.sum() / num_boxes
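+
+# Illustrative worked example (hedged): for a single box (num_boxes = 1) with flattened predictions
+# sigmoid(inputs) = [0.9, 0.1] and targets = [1.0, 0.0], numerator = 2 * 0.9 = 1.8 and
+# denominator = 1.0 + 1.0 = 2.0, so the dice loss is 1 - (1.8 + 1) / (2.0 + 1), roughly 0.067.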
+
+
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
+ """
+ Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002.
+
+ Args:
+ inputs (`torch.FloatTensor` of arbitrary shape):
+ The predictions for each example.
+ targets (`torch.FloatTensor` with the same shape as `inputs`):
+ A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
+ and 1 for the positive class).
+ alpha (`float`, *optional*, defaults to `0.25`):
+ Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
+ gamma (`float`, *optional*, defaults to `2`):
+ Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
+
+ Returns:
+ Loss tensor
+ """
+ prob = inputs.sigmoid()
+ ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+ # add modulating factor
+ p_t = prob * targets + (1 - prob) * (1 - targets)
+ loss = ce_loss * ((1 - p_t) ** gamma)
+
+ if alpha >= 0:
+ alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+ loss = alpha_t * loss
+
+ return loss.mean(1).sum() / num_boxes
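+
+# Illustrative worked example (hedged): for a positive target with prob = sigmoid(logit) = 0.9,
+# p_t = 0.9, so the modulating factor (1 - p_t)**2 = 0.01 and alpha_t = 0.25 scale the plain BCE
+# term -log(0.9) ~= 0.105 down to roughly 0.0003; a hard positive with prob = 0.1 instead keeps
+# a factor of 0.25 * 0.81 on -log(0.1) ~= 2.3, so hard examples dominate the loss.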
+
+
+class DetaLoss(nn.Module):
+ """
+ This class computes the losses for `DetaForObjectDetection`. The process happens in two steps: 1) we compute
+ the Hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of
+ matched ground-truth / prediction (we supervise the class and the box).
+
+ Args:
+ matcher (`DetaHungarianMatcher`):
+ Module able to compute a matching between targets and proposals.
+ num_classes (`int`):
+ Number of object categories, omitting the special no-object category.
+ focal_alpha (`float`):
+ Alpha parameter in focal loss.
+ losses (`list[str]`):
+ List of all the losses to be applied. See `get_loss` for a list of all available losses.
+ num_queries (`int`):
+ Number of object queries, used by the second-stage assigner.
+ assign_first_stage (`bool`, *optional*, defaults to `False`):
+ Whether to match encoder (first-stage) proposals to targets with `DetaStage1Assigner`.
+ assign_second_stage (`bool`, *optional*, defaults to `False`):
+ Whether to match decoder outputs to targets with `DetaStage2Assigner` instead of the Hungarian matcher.
+ """
+
+ def __init__(
+ self,
+ matcher,
+ num_classes,
+ focal_alpha,
+ losses,
+ num_queries,
+ assign_first_stage=False,
+ assign_second_stage=False,
+ ):
+ super().__init__()
+ self.matcher = matcher
+ self.num_classes = num_classes
+ self.focal_alpha = focal_alpha
+ self.losses = losses
+ self.assign_first_stage = assign_first_stage
+ self.assign_second_stage = assign_second_stage
+
+ if self.assign_first_stage:
+ self.stg1_assigner = DetaStage1Assigner()
+ if self.assign_second_stage:
+ self.stg2_assigner = DetaStage2Assigner(num_queries)
+
+ def loss_labels(self, outputs, targets, indices, num_boxes):
+ """
+ Classification loss (binary focal loss). Target dicts must contain the key "class_labels" containing a tensor
+ of dim [nb_target_boxes].
+ """
+ if "logits" not in outputs:
+ raise KeyError("No logits were found in the outputs")
+ source_logits = outputs["logits"]
+
+ idx = self._get_source_permutation_idx(indices)
+ target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
+ target_classes = torch.full(
+ source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
+ )
+ target_classes[idx] = target_classes_o
+
+ target_classes_onehot = torch.zeros(
+ [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
+ dtype=source_logits.dtype,
+ layout=source_logits.layout,
+ device=source_logits.device,
+ )
+ target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
+
+ target_classes_onehot = target_classes_onehot[:, :, :-1]
+ loss_ce = (
+ sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
+ * source_logits.shape[1]
+ )
+ losses = {"loss_ce": loss_ce}
+
+ return losses
+
+ @torch.no_grad()
+ def loss_cardinality(self, outputs, targets, indices, num_boxes):
+ """
+ Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
+
+ This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
+ """
+ logits = outputs["logits"]
+ device = logits.device
+ target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
+ # Count the number of predictions that are NOT "no-object" (which is the last class)
+ card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
+ card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
+ losses = {"cardinality_error": card_err}
+ return losses
+
+ def loss_boxes(self, outputs, targets, indices, num_boxes):
+ """
+ Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
+
+ Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
+ are expected in format (center_x, center_y, w, h), normalized by the image size.
+ """
+ if "pred_boxes" not in outputs:
+ raise KeyError("No predicted boxes found in outputs")
+ idx = self._get_source_permutation_idx(indices)
+ source_boxes = outputs["pred_boxes"][idx]
+ target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+ loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
+
+ losses = {}
+ losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+
+ loss_giou = 1 - torch.diag(
+ generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
+ )
+ losses["loss_giou"] = loss_giou.sum() / num_boxes
+ return losses
+
+ def _get_source_permutation_idx(self, indices):
+ # permute predictions following indices
+ batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
+ source_idx = torch.cat([source for (source, _) in indices])
+ return batch_idx, source_idx
+
+ def _get_target_permutation_idx(self, indices):
+ # permute targets following indices
+ batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
+ target_idx = torch.cat([target for (_, target) in indices])
+ return batch_idx, target_idx
+
+ def get_loss(self, loss, outputs, targets, indices, num_boxes):
+ loss_map = {
+ "labels": self.loss_labels,
+ "cardinality": self.loss_cardinality,
+ "boxes": self.loss_boxes,
+ }
+ if loss not in loss_map:
+ raise ValueError(f"Loss {loss} not supported")
+ return loss_map[loss](outputs, targets, indices, num_boxes)
+
+ def forward(self, outputs, targets):
+ """
+ This performs the loss computation.
+
+ Args:
+ outputs (`dict`, *optional*):
+ Dictionary of tensors, see the output specification of the model for the format.
+ targets (`list[dict]`, *optional*):
+ List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
+ losses applied, see each loss' doc.
+ """
+ outputs_without_aux = {k: v for k, v in outputs.items() if k not in ("auxiliary_outputs", "enc_outputs")}
+
+ # Retrieve the matching between the outputs of the last layer and the targets
+ if self.assign_second_stage:
+ indices = self.stg2_assigner(outputs_without_aux, targets)
+ else:
+ indices = self.matcher(outputs_without_aux, targets)
+
+ # Compute the average number of target boxes across all nodes, for normalization purposes
+ num_boxes = sum(len(t["class_labels"]) for t in targets)
+ num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+ # Check that we have initialized the distributed state
+ world_size = 1
+ if is_accelerate_available():
+ if PartialState._shared_state != {}:
+ num_boxes = reduce(num_boxes)
+ world_size = PartialState().num_processes
+ num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
+
+ # Compute all the requested losses
+ losses = {}
+ for loss in self.losses:
+ losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
+
+ # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+ if "auxiliary_outputs" in outputs:
+ for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
+ if not self.assign_second_stage:
+ indices = self.matcher(auxiliary_outputs, targets)
+ for loss in self.losses:
+ l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
+ l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+ losses.update(l_dict)
+
+ if "enc_outputs" in outputs:
+ enc_outputs = outputs["enc_outputs"]
+ bin_targets = copy.deepcopy(targets)
+ for bt in bin_targets:
+ bt["class_labels"] = torch.zeros_like(bt["class_labels"])
+ if self.assign_first_stage:
+ indices = self.stg1_assigner(enc_outputs, bin_targets)
+ else:
+ indices = self.matcher(enc_outputs, bin_targets)
+ for loss in self.losses:
+ l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
+ l_dict = {k + "_enc": v for k, v in l_dict.items()}
+ losses.update(l_dict)
+
+ return losses
+
+
+class DetaMLPPredictionHead(nn.Module):
+ """
+ Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+ height and width of a bounding box w.r.t. an image.
+
+ Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+ """
+
+ def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+ super().__init__()
+ self.num_layers = num_layers
+ h = [hidden_dim] * (num_layers - 1)
+ self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+ def forward(self, x):
+ for i, layer in enumerate(self.layers):
+ x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+ return x
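+
+# Illustrative: DetaMLPPredictionHead(input_dim=256, hidden_dim=256, output_dim=4, num_layers=3)
+# builds Linear(256, 256) -> ReLU -> Linear(256, 256) -> ReLU -> Linear(256, 4), i.e. the bbox
+# regression branch mapping d_model features to 4 box deltas, with no activation after the last layer.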
+
+
+class DetaHungarianMatcher(nn.Module):
+ """
+ This class computes an assignment between the targets and the predictions of the network.
+
+ For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
+ predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
+ un-matched (and thus treated as non-objects).
+
+ Args:
+ class_cost:
+ The relative weight of the classification error in the matching cost.
+ bbox_cost:
+ The relative weight of the L1 error of the bounding box coordinates in the matching cost.
+ giou_cost:
+ The relative weight of the giou loss of the bounding box in the matching cost.
+ """
+
+ def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
+ super().__init__()
+ requires_backends(self, ["scipy"])
+
+ self.class_cost = class_cost
+ self.bbox_cost = bbox_cost
+ self.giou_cost = giou_cost
+ if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
+ raise ValueError("All costs of the Matcher can't be 0")
+
+ @torch.no_grad()
+ def forward(self, outputs, targets):
+ """
+ Args:
+ outputs (`dict`):
+ A dictionary that contains at least these entries:
+ * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+ * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
+ targets (`list[dict]`):
+ A list of targets (len(targets) = batch_size), where each target is a dict containing:
+ * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
+ ground-truth
+ objects in the target) containing the class labels
+ * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
+
+ Returns:
+ `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
+ - index_i is the indices of the selected predictions (in order)
+ - index_j is the indices of the corresponding selected targets (in order)
+ For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+ """
+ batch_size, num_queries = outputs["logits"].shape[:2]
+
+ # We flatten to compute the cost matrices in a batch
+ out_prob = outputs["logits"].flatten(0, 1).sigmoid() # [batch_size * num_queries, num_classes]
+ out_bbox = outputs["pred_boxes"].flatten(0, 1) # [batch_size * num_queries, 4]
+
+ # Also concat the target labels and boxes
+ target_ids = torch.cat([v["class_labels"] for v in targets])
+ target_bbox = torch.cat([v["boxes"] for v in targets])
+
+ # Compute the classification cost.
+ alpha = 0.25
+ gamma = 2.0
+ neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
+ pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
+ class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
+
+ # Compute the L1 cost between boxes
+ bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
+
+ # Compute the giou cost between boxes
+ giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
+
+ # Final cost matrix
+ cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
+ cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
+
+ sizes = [len(v["boxes"]) for v in targets]
+ indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
+ return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
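+
+# Illustrative usage (hedged): given outputs = {"logits": torch.randn(2, 300, 91), "pred_boxes":
+# torch.rand(2, 300, 4)} and two targets with 3 and 5 boxes, the matcher returns two (index_i, index_j)
+# pairs of lengths 3 and 5 respectively, since every ground-truth box is matched to exactly one query.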
+
+
+def _upcast(t: Tensor) -> Tensor:
+ # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+ if t.is_floating_point():
+ return t if t.dtype in (torch.float32, torch.float64) else t.float()
+ else:
+ return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+def box_area(boxes: Tensor) -> Tensor:
+ """
+ Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
+
+ Args:
+ boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
+ Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
+ < x2` and `0 <= y1 < y2`.
+
+ Returns:
+ `torch.FloatTensor`: a tensor containing the area for each box.
+ """
+ boxes = _upcast(boxes)
+ return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def box_iou(boxes1, boxes2):
+ area1 = box_area(boxes1)
+ area2 = box_area(boxes2)
+
+ left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2]) # [N,M,2]
+ right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:]) # [N,M,2]
+
+ width_height = (right_bottom - left_top).clamp(min=0) # [N,M,2]
+ inter = width_height[:, :, 0] * width_height[:, :, 1] # [N,M]
+
+ union = area1[:, None] + area2 - inter
+
+ iou = inter / union
+ return iou, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+ """
+ Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
+
+ Returns:
+ `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
+ """
+ # degenerate boxes gives inf / nan results
+ # so do an early check
+ if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
+ raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
+ if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
+ raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
+ iou, union = box_iou(boxes1, boxes2)
+
+ top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+ bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+ width_height = (bottom_right - top_left).clamp(min=0) # [N,M,2]
+ area = width_height[:, :, 0] * width_height[:, :, 1]
+
+ return iou - (area - union) / area
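+
+# Illustrative worked example (hedged): for boxes1 = [[0, 0, 2, 2]] and boxes2 = [[1, 1, 3, 3]],
+# the intersection is 1 and the union is 7 (iou = 1/7); the smallest enclosing box [0, 0, 3, 3]
+# has area 9, so the generalized IoU is 1/7 - (9 - 7) / 9, roughly -0.079.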
+
+
+# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/layers/wrappers.py#L100
+def nonzero_tuple(x):
+ """
+ An 'as_tuple=True' version of torch.nonzero to support torchscript, because of
+ https://github.com/pytorch/pytorch/issues/38718
+ """
+ if torch.jit.is_scripting():
+ if x.dim() == 0:
+ return x.unsqueeze(0).nonzero().unbind(1)
+ return x.nonzero().unbind(1)
+ else:
+ return x.nonzero(as_tuple=True)
+
+
+# from https://github.com/facebookresearch/detectron2/blob/9921a2caa585d4fa66c4b534b6fab6e74d89b582/detectron2/modeling/matcher.py#L9
+class DetaMatcher:
+ """
+ This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will
+ have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements.
+
+ The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth,
+ prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box
+ intersection-over-union overlap values.
+
+ The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that
+ matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction.
+ """
+
+ def __init__(self, thresholds: list[float], labels: list[int], allow_low_quality_matches: bool = False):
+ """
+ Args:
+ thresholds (`list[float]`):
+ A list of thresholds used to stratify predictions into levels.
+ labels (`list[int]`):
+ A list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1}
+ signifying {ignore, negative class, positive class}, respectively.
+ allow_low_quality_matches (`bool`, *optional*, defaults to `False`):
+ If `True`, produce additional matches for predictions with maximum match quality lower than
+ high_threshold. See `set_low_quality_matches_` for more details.
+
+ For example, with thresholds = [0.3, 0.5] and labels = [0, -1, 1]:
+ all predictions with iou < 0.3 will be marked with 0 and
+ thus will be considered as false positives while training. All predictions with 0.3 <= iou < 0.5 will
+ be marked with -1 and thus will be ignored. All predictions with 0.5 <= iou will be marked with 1 and
+ thus will be considered as true positives.
+ """
+ # Add -inf and +inf to first and last position in thresholds
+ thresholds = thresholds[:]
+ if thresholds[0] < 0:
+ raise ValueError("Thresholds should be positive")
+ thresholds.insert(0, -float("inf"))
+ thresholds.append(float("inf"))
+ # Currently torchscript does not support all + generator
+ if not all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])):
+ raise ValueError("Thresholds should be sorted.")
+ if not all(l in [-1, 0, 1] for l in labels):
+ raise ValueError("All labels should be either -1, 0 or 1")
+ if len(labels) != len(thresholds) - 1:
+ raise ValueError("Number of labels should be equal to number of thresholds - 1")
+ self.thresholds = thresholds
+ self.labels = labels
+ self.allow_low_quality_matches = allow_low_quality_matches
+
+ def __call__(self, match_quality_matrix):
+ """
+ Args:
+ match_quality_matrix (Tensor[float]): an MxN tensor, containing the
+ pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0
+ (due to the use of `torch.nonzero` for selecting indices in `set_low_quality_matches_`).
+
+ Returns:
+ matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
+ ground-truth index in [0, M)
+ match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
+ whether a prediction is a true or false positive or ignored
+ """
+ if match_quality_matrix.dim() != 2:
+ raise ValueError("match_quality_matrix must be 2-dimensional (M ground truths x N predictions)")
+ if match_quality_matrix.numel() == 0:
+ default_matches = match_quality_matrix.new_full((match_quality_matrix.size(1),), 0, dtype=torch.int64)
+ # When no gt boxes exist, we define IOU = 0 and therefore set labels
+ # to `self.labels[0]`, which usually defaults to background class 0
+ # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
+ default_match_labels = match_quality_matrix.new_full(
+ (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
+ )
+ return default_matches, default_match_labels
+
+ if not torch.all(match_quality_matrix >= 0):
+ raise ValueError("match_quality_matrix must contain only non-negative values")
+
+ # match_quality_matrix is M (gt) x N (predicted)
+ # Max over gt elements (dim 0) to find best gt candidate for each prediction
+ matched_vals, matches = match_quality_matrix.max(dim=0)
+
+ match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+ for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
+ low_high = (matched_vals >= low) & (matched_vals < high)
+ match_labels[low_high] = l
+
+ if self.allow_low_quality_matches:
+ self.set_low_quality_matches_(match_labels, match_quality_matrix)
+
+ return matches, match_labels
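+
+ # Illustrative sketch: with thresholds=[0.3, 0.5], labels=[0, -1, 1] and a 1x3 quality matrix
+ # [[0.2, 0.4, 0.7]], matched_vals is [0.2, 0.4, 0.7] and match_labels becomes [0, -1, 1]: a false
+ # positive, an ignored prediction and a true positive respectively.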
+
+ def set_low_quality_matches_(self, match_labels, match_quality_matrix):
+ """
+ Produce additional matches for predictions that have only low-quality matches. Specifically, for each
+ ground-truth G find the set of predictions that have maximum overlap with it (including ties); for each
+ prediction in that set, if it is unmatched, then match it to the ground-truth G.
+
+ This function implements the RPN assignment case (i) in Sec. 3.1.2 of :paper:`Faster R-CNN`.
+ """
+ # For each gt, find the prediction with which it has highest quality
+ highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+ # Find the highest quality match available, even if it is low, including ties.
+ # Note that the matches qualities must be positive due to the use of
+ # `torch.nonzero`.
+ _, pred_inds_with_highest_quality = nonzero_tuple(match_quality_matrix == highest_quality_foreach_gt[:, None])
+ # If an anchor was labeled positive only due to a low-quality match
+ # with gt_A, but it has larger overlap with gt_B, its matched index will still be gt_B.
+ # This follows the implementation in Detectron, and is found to have no significant impact.
+ match_labels[pred_inds_with_highest_quality] = 1
+
+
+# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/sampling.py#L9
+def subsample_labels(labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int):
+ """
+ Return `num_samples` (or fewer, if not enough found) random samples from `labels` which is a mixture of positives &
+ negatives. It will try to return as many positives as possible without exceeding `positive_fraction * num_samples`,
+ and then try to fill the remaining slots with negatives.
+
+ Args:
+ labels (Tensor): (N, ) label vector with values:
+ * -1: ignore
+ * bg_label: background ("negative") class
+ * otherwise: one or more foreground ("positive") classes
+ num_samples (int): The total number of labels with value >= 0 to return.
+ Values that are not sampled will be filled with -1 (ignore).
+ positive_fraction (float): The number of subsampled labels with values > 0
+ is `min(num_positives, int(positive_fraction * num_samples))`. The number of negatives sampled is
+ `min(num_negatives, num_samples - num_positives_sampled)`. In other words, if there are not enough
+ positives, the sample is filled with negatives. If there are also not enough negatives, then as many
+ elements are sampled as is possible.
+ bg_label (int): label index of background ("negative") class.
+
+ Returns:
+ pos_idx, neg_idx (Tensor):
+ 1D vector of indices. The total length of both is `num_samples` or fewer.
+ """
+ positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
+ negative = nonzero_tuple(labels == bg_label)[0]
+
+ num_pos = int(num_samples * positive_fraction)
+ # protect against not enough positive examples
+ num_pos = min(positive.numel(), num_pos)
+ num_neg = num_samples - num_pos
+ # protect against not enough negative examples
+ num_neg = min(negative.numel(), num_neg)
+
+ # randomly select positive and negative examples
+ perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
+ perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
+
+ pos_idx = positive[perm1]
+ neg_idx = negative[perm2]
+ return pos_idx, neg_idx
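+
+# Illustrative (hedged): with labels = [-1, 0, 0, 0, 5, 7], num_samples = 4, positive_fraction = 0.5
+# and bg_label = 0, at most int(4 * 0.5) = 2 positives are drawn from indices {4, 5} and the two
+# remaining slots are filled with negatives drawn from indices {1, 2, 3}.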
+
+
+def sample_topk_per_gt(pr_inds, gt_inds, iou, k):
+ if len(gt_inds) == 0:
+ return pr_inds, gt_inds
+ # find topk matches for each gt
+ gt_inds2, counts = gt_inds.unique(return_counts=True)
+ scores, pr_inds2 = iou[gt_inds2].topk(k, dim=1)
+ gt_inds2 = gt_inds2[:, None].repeat(1, k)
+
+ # filter to as many matches that gt has
+ pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)])
+ gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)])
+ return pr_inds3, gt_inds3
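+
+# Illustrative (hedged): if ground truth g was originally matched to c proposals, this replaces them
+# with the top-min(c, k) proposals by IoU with g (taken over all proposals), so no ground truth keeps
+# more than k matches and better-overlapping proposals are preferred when truncating.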
+
+
+# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/roi_heads/roi_heads.py#L123
+class DetaStage2Assigner(nn.Module):
+ def __init__(self, num_queries, max_k=4):
+ super().__init__()
+ self.positive_fraction = 0.25
+ self.bg_label = 400 # number > 91 to filter out later
+ self.batch_size_per_image = num_queries
+ self.proposal_matcher = DetaMatcher(thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True)
+ self.k = max_k
+
+ def _sample_proposals(self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor):
+ """
+ Based on the matching between N proposals and M groundtruth, sample the proposals and set their classification
+ labels.
+
+ Args:
+ matched_idxs (Tensor): a vector of length N, each is the best-matched
+ gt index in [0, M) for each proposal.
+ matched_labels (Tensor): a vector of length N, the matcher's label
+ (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
+ gt_classes (Tensor): a vector of length M.
+
+ Returns:
+ Tensor: a vector of indices of sampled proposals. Each is in [0, N).
+ Tensor: a vector of the same length, the classification label for each sampled proposal. Each sample
+ is labeled as either a category in [0, num_classes) or the background (num_classes).
+ """
+ has_gt = gt_classes.numel() > 0
+ # Get the corresponding GT for each proposal
+ if has_gt:
+ gt_classes = gt_classes[matched_idxs]
+ # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+ gt_classes[matched_labels == 0] = self.bg_label
+ # Label ignore proposals (-1 label)
+ gt_classes[matched_labels == -1] = -1
+ else:
+ gt_classes = torch.zeros_like(matched_idxs) + self.bg_label
+
+ sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
+ gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label
+ )
+
+ sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
+ return sampled_idxs, gt_classes[sampled_idxs]
+
+ def forward(self, outputs, targets, return_cost_matrix=False):
+ # COCO categories are from 1 to 90. They set num_classes=91 and apply sigmoid.
+
+ bs = len(targets)
+ indices = []
+ ious = []
+ for b in range(bs):
+ iou, _ = box_iou(
+ center_to_corners_format(targets[b]["boxes"]),
+ center_to_corners_format(outputs["init_reference"][b].detach()),
+ )
+ matched_idxs, matched_labels = self.proposal_matcher(
+ iou
+ ) # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.6, 0 ow]
+ (
+ sampled_idxs,
+ sampled_gt_classes,
+ ) = self._sample_proposals( # list of sampled proposal_ids, sampled_id -> [0, num_classes)+[bg_label]
+ matched_idxs, matched_labels, targets[b]["class_labels"]
+ )
+ pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label]
+ pos_gt_inds = matched_idxs[pos_pr_inds]
+ pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
+ indices.append((pos_pr_inds, pos_gt_inds))
+ ious.append(iou)
+ if return_cost_matrix:
+ return indices, ious
+ return indices
+
+ def postprocess_indices(self, pr_inds, gt_inds, iou):
+ return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
+
+
+# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/proposal_generator/rpn.py#L181
+class DetaStage1Assigner(nn.Module):
+ def __init__(self, t_low=0.3, t_high=0.7, max_k=4):
+ super().__init__()
+ self.positive_fraction = 0.5
+ self.batch_size_per_image = 256
+ self.k = max_k
+ self.t_low = t_low
+ self.t_high = t_high
+ self.anchor_matcher = DetaMatcher(
+ thresholds=[t_low, t_high], labels=[0, -1, 1], allow_low_quality_matches=True
+ )
+
+ def _subsample_labels(self, label):
+ """
+ Randomly sample a subset of positive and negative examples, and overwrite the label vector to the ignore value
+ (-1) for all elements that are not included in the sample.
+
+        Args:
+            label (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
+        """
+ pos_idx, neg_idx = subsample_labels(label, self.batch_size_per_image, self.positive_fraction, 0)
+ # Fill with the ignore label (-1), then set positive and negative labels
+ label.fill_(-1)
+ label.scatter_(0, pos_idx, 1)
+ label.scatter_(0, neg_idx, 0)
+ return label
+
+ def forward(self, outputs, targets):
+ bs = len(targets)
+ indices = []
+ for b in range(bs):
+ anchors = outputs["anchors"][b]
+ if len(targets[b]["boxes"]) == 0:
+ indices.append(
+ (
+ torch.tensor([], dtype=torch.long, device=anchors.device),
+ torch.tensor([], dtype=torch.long, device=anchors.device),
+ )
+ )
+ continue
+ iou, _ = box_iou(
+ center_to_corners_format(targets[b]["boxes"]),
+ center_to_corners_format(anchors),
+ )
+ matched_idxs, matched_labels = self.anchor_matcher(
+ iou
+ ) # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.7, 0 if iou < 0.3, -1 ow]
+ matched_labels = self._subsample_labels(matched_labels)
+
+ all_pr_inds = torch.arange(len(anchors), device=matched_labels.device)
+ pos_pr_inds = all_pr_inds[matched_labels == 1]
+ pos_gt_inds = matched_idxs[pos_pr_inds]
+ pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
+ pos_pr_inds, pos_gt_inds = pos_pr_inds.to(anchors.device), pos_gt_inds.to(anchors.device)
+ indices.append((pos_pr_inds, pos_gt_inds))
+ return indices
+
+ def postprocess_indices(self, pr_inds, gt_inds, iou):
+ return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
+
+
+__all__ = ["DetaForObjectDetection", "DetaModel", "DetaPreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..db3d0a634051c713aee33b48f2962647ac48d0eb
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_efficientformer import *
+ from .image_processing_efficientformer import *
+ from .modeling_efficientformer import *
+ from .modeling_tf_efficientformer import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a80a176236d775b378390231e58c10327e199cb
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c87b2f460d6525a39fc0c28bee35785525a7a3da
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/configuration_efficientformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e821e1941a51262995b29c5e3d753f4113dc9983
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/image_processing_efficientformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..631e714f39ea6b9609fc059f3581e528316f8063
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_efficientformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c7e67ef896e4de08f23d2a7e669416b896744c2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/__pycache__/modeling_tf_efficientformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aaee649ba047e5967f132eb0b3527dce5855d11
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
@@ -0,0 +1,170 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""EfficientFormer model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class EfficientFormerConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to
+ instantiate an EfficientFormer model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer
+ [snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+        depths (`list[int]`, *optional*, defaults to `[3, 2, 6, 4]`):
+            Depth of each stage.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[48, 96, 224, 448]`):
+            Dimensionality of each stage.
+        downsamples (`list[bool]`, *optional*, defaults to `[True, True, True, True]`):
+            Whether or not to downsample inputs between two stages.
+        dim (`int`, *optional*, defaults to 448):
+            Number of channels in Meta3D layers.
+        key_dim (`int`, *optional*, defaults to 32):
+            The size of the key in the Meta3D block.
+        attention_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the dimension of the query and value to the dimension of the key in the MHSA block.
+        resolution (`int`, *optional*, defaults to 7):
+            Size of each patch.
+ num_hidden_layers (`int`, *optional*, defaults to 5):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 8):
+ Number of attention heads for each attention layer in the 3D MetaBlock.
+ mlp_expansion_ratio (`int`, *optional*, defaults to 4):
+ Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings and encoder.
+ patch_size (`int`, *optional*, defaults to 16):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ pool_size (`int`, *optional*, defaults to 3):
+ Kernel size of pooling layers.
+ downsample_patch_size (`int`, *optional*, defaults to 3):
+ The size of patches in downsampling layers.
+ downsample_stride (`int`, *optional*, defaults to 2):
+ The stride of convolution kernels in downsampling layers.
+ downsample_pad (`int`, *optional*, defaults to 1):
+ Padding in downsampling layers.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+ Rate at which to increase dropout probability in DropPath.
+ num_meta3d_blocks (`int`, *optional*, defaults to 1):
+ The number of 3D MetaBlocks in the last stage.
+ distillation (`bool`, *optional*, defaults to `True`):
+ Whether to add a distillation head.
+ use_layer_scale (`bool`, *optional*, defaults to `True`):
+ Whether to scale outputs from token mixers.
+ layer_scale_init_value (`float`, *optional*, defaults to 1e-5):
+ Factor by which outputs from token mixers are scaled.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to `224`):
+            The size (resolution) of each image.
+        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the batch normalization layers.
+
+ Example:
+
+ ```python
+ >>> from transformers import EfficientFormerConfig, EfficientFormerModel
+
+    >>> # Initializing an EfficientFormer efficientformer-l1 style configuration
+    >>> configuration = EfficientFormerConfig()
+
+    >>> # Initializing an EfficientFormerModel (with random weights) from the efficientformer-l1 style configuration
+ >>> model = EfficientFormerModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "efficientformer"
+
+ def __init__(
+ self,
+ depths: list[int] = [3, 2, 6, 4],
+ hidden_sizes: list[int] = [48, 96, 224, 448],
+ downsamples: list[bool] = [True, True, True, True],
+ dim: int = 448,
+ key_dim: int = 32,
+ attention_ratio: int = 4,
+ resolution: int = 7,
+ num_hidden_layers: int = 5,
+ num_attention_heads: int = 8,
+ mlp_expansion_ratio: int = 4,
+ hidden_dropout_prob: float = 0.0,
+ patch_size: int = 16,
+ num_channels: int = 3,
+ pool_size: int = 3,
+ downsample_patch_size: int = 3,
+ downsample_stride: int = 2,
+ downsample_pad: int = 1,
+ drop_path_rate: float = 0.0,
+ num_meta3d_blocks: int = 1,
+ distillation: bool = True,
+ use_layer_scale: bool = True,
+ layer_scale_init_value: float = 1e-5,
+ hidden_act: str = "gelu",
+ initializer_range: float = 0.02,
+ layer_norm_eps: float = 1e-12,
+ image_size: int = 224,
+ batch_norm_eps: float = 1e-05,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.hidden_sizes = hidden_sizes
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.depths = depths
+ self.mlp_expansion_ratio = mlp_expansion_ratio
+ self.downsamples = downsamples
+ self.dim = dim
+ self.key_dim = key_dim
+ self.attention_ratio = attention_ratio
+ self.resolution = resolution
+ self.pool_size = pool_size
+ self.downsample_patch_size = downsample_patch_size
+ self.downsample_stride = downsample_stride
+ self.downsample_pad = downsample_pad
+ self.drop_path_rate = drop_path_rate
+ self.num_meta3d_blocks = num_meta3d_blocks
+ self.distillation = distillation
+ self.use_layer_scale = use_layer_scale
+ self.layer_scale_init_value = layer_scale_init_value
+ self.image_size = image_size
+ self.batch_norm_eps = batch_norm_eps
+
+
+__all__ = [
+ "EfficientFormerConfig",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2dd1281e920fdea3b4c51ef79c09bc86c605921
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
@@ -0,0 +1,324 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for EfficientFormer."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
+ get_resize_output_image_size,
+ resize,
+ to_channel_dimension_format,
+)
+from ....image_utils import (
+ IMAGENET_DEFAULT_MEAN,
+ IMAGENET_DEFAULT_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ infer_channel_dimension_format,
+ is_batched,
+ is_scaled_image,
+ to_numpy_array,
+ valid_images,
+ validate_kwargs,
+ validate_preprocess_arguments,
+)
+from ....utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class EfficientFormerImageProcessor(BaseImageProcessor):
+ r"""
+    Constructs an EfficientFormer image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
+ size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
+ size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
+ Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+ method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+ `preprocess` method.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+ `preprocess` method.
+        crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+ Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+ method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+ parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+ `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
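+
+    Example (a minimal sketch, not from the original file; assumes the processor is constructed directly with
+    its defaults):
+
+    ```python
+    >>> import numpy as np
+
+    >>> image_processor = EfficientFormerImageProcessor()
+    >>> image = np.random.randint(0, 256, (480, 640, 3), dtype=np.uint8)  # dummy HWC image
+    >>> inputs = image_processor(image, return_tensors="np")
+    >>> inputs["pixel_values"].shape
+    (1, 3, 224, 224)
+    ```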
+ """
+
+ model_input_names = ["pixel_values"]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Optional[dict[str, int]] = None,
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ do_center_crop: bool = True,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ crop_size: Optional[dict[str, int]] = None,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ size = size if size is not None else {"height": 224, "width": 224}
+ size = get_size_dict(size)
+ crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+ crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+
+ self.do_resize = do_resize
+ self.do_rescale = do_rescale
+ self.do_normalize = do_normalize
+ self.do_center_crop = do_center_crop
+ self.crop_size = crop_size
+ self.size = size
+ self.resample = resample
+ self.rescale_factor = rescale_factor
+ self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+ self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+ self._valid_processor_keys = [
+ "images",
+ "do_resize",
+ "size",
+ "resample",
+ "do_center_crop",
+ "crop_size",
+ "do_rescale",
+ "rescale_factor",
+ "do_normalize",
+ "image_mean",
+ "image_std",
+ "return_tensors",
+ "data_format",
+ "input_data_format",
+ ]
+
+ def resize(
+ self,
+ image: np.ndarray,
+ size: dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize an image to `(size["height"], size["width"])`.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`dict[str, int]`):
+ Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+ resample:
+ `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+ data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the output image. If unset, the channel dimension format of the input
+ image is used. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+
+ Returns:
+ `np.ndarray`: The resized image.
+ """
+ size = get_size_dict(size)
+
+ if "shortest_edge" in size:
+ size = get_resize_output_image_size(
+ image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+ )
+ elif "height" in size and "width" in size:
+ size = (size["height"], size["width"])
+ else:
+ raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}")
+ return resize(
+ image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+ )
+
+ def preprocess(
+ self,
+ images: ImageInput,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ resample: Optional[PILImageResampling] = None,
+ do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or batch of images.
+
+ Args:
+ images (`ImageInput`):
+ Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`dict[str, int]`, *optional*, defaults to `self.size`):
+ Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
+ resizing.
+ resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+ `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+ an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+ Whether to center crop the image.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image values between [0 - 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+ Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean to use if `do_normalize` is set to `True`.
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation to use if `do_normalize` is set to `True`.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+ crop_size = crop_size if crop_size is not None else self.crop_size
+ crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+ resample = resample if resample is not None else self.resample
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+
+ size = size if size is not None else self.size
+ size_dict = get_size_dict(size)
+
+ validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+ if not is_batched(images):
+ images = [images]
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+ # All transformations expect numpy arrays.
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ if do_resize:
+ images = [
+ self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ if do_center_crop:
+ images = [
+ self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
+ ]
+
+ if do_rescale:
+ images = [
+ self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ if do_normalize:
+ images = [
+ self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+ for image in images
+ ]
+
+ images = [
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+ ]
+
+ data = {"pixel_values": images}
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["EfficientFormerImageProcessor"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d35d3e82c007c6c6cf9326e1c933df34c6ae41ba
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
@@ -0,0 +1,786 @@
+# coding=utf-8
+# Copyright 2022 Snapchat Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch EfficientFormer model."""
+
+import itertools
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+)
+from .configuration_efficientformer import EfficientFormerConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "EfficientFormerConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
+_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
+
+
+class EfficientFormerPatchEmbeddings(nn.Module):
+ """
+ This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels,
+ height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride]
+ """
+
+ def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True):
+ super().__init__()
+ self.num_channels = num_channels
+
+ self.projection = nn.Conv2d(
+ num_channels,
+ embed_dim,
+ kernel_size=config.downsample_patch_size,
+ stride=config.downsample_stride,
+ padding=config.downsample_pad,
+ )
+ self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity()
+
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+ batch_size, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+
+ embeddings = self.projection(pixel_values)
+ embeddings = self.norm(embeddings)
+
+ return embeddings
+
+
+class EfficientFormerSelfAttention(nn.Module):
+ def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int):
+ super().__init__()
+
+ self.num_heads = num_heads
+ self.key_dim = key_dim
+ self.attention_ratio = attention_ratio
+ self.scale = key_dim**-0.5
+ self.total_key_dim = key_dim * num_heads
+ self.expanded_key_dim = int(attention_ratio * key_dim)
+ self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
+ hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
+ self.qkv = nn.Linear(dim, hidden_size)
+ self.projection = nn.Linear(self.total_expanded_key_dim, dim)
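+        # Build a learned relative-position bias table: each unique absolute offset (|dx|, |dy|) between two
+        # positions on the `resolution` x `resolution` grid gets one bias entry per head, and
+        # `attention_bias_idxs` maps every query/key position pair to its entry in that table.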
+ points = list(itertools.product(range(resolution), range(resolution)))
+ num_points = len(points)
+ attention_offsets = {}
+ idxs = []
+ for point_1 in points:
+ for point_2 in points:
+ offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
+ if offset not in attention_offsets:
+ attention_offsets[offset] = len(attention_offsets)
+ idxs.append(attention_offsets[offset])
+ self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
+ self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points))
+
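+    # In eval mode, cache the gathered biases as `self.ab`; drop the cache when switching back to training,
+    # since `attention_biases` is a learnable parameter that keeps changing.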
+ @torch.no_grad()
+ def train(self, mode=True):
+ super().train(mode)
+ if mode and hasattr(self, "ab"):
+ del self.ab
+ else:
+ self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+ def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+ batch_size, sequence_length, num_channels = hidden_states.shape
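+        # A single projection produces queries, keys and values; reshape to (batch, seq, heads, -1) and split
+        # the last dimension into chunks of key_dim, key_dim and expanded_key_dim channels.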
+ qkv = self.qkv(hidden_states)
+ query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split(
+ [self.key_dim, self.key_dim, self.expanded_key_dim], dim=3
+ )
+ query_layer = query_layer.permute(0, 2, 1, 3)
+ key_layer = key_layer.permute(0, 2, 1, 3)
+ value_layer = value_layer.permute(0, 2, 1, 3)
+
+        # Calling `model.to(torch_device)` won't change `self.ab.device` if there is no follow-up `train` or
+        # `eval` call. Let's do it manually here, so users won't have to do this every time.
+ if not self.training:
+ self.ab = self.ab.to(self.attention_biases.device)
+ attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + (
+ self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab
+ )
+
+ attention_probs = attention_probs.softmax(dim=-1)
+
+ context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2)
+ context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim)
+ context_layer = self.projection(context_layer)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class EfficientFormerConvStem(nn.Module):
+ def __init__(self, config: EfficientFormerConfig, out_channels: int):
+ super().__init__()
+
+ self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1)
+ self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps)
+
+ self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1)
+ self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
+
+ self.activation = nn.ReLU()
+
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+ features = self.batchnorm_before(self.convolution1(pixel_values))
+ features = self.activation(features)
+ features = self.batchnorm_after(self.convolution2(features))
+ features = self.activation(features)
+
+ return features
+
+
+class EfficientFormerPooling(nn.Module):
+ def __init__(self, pool_size: int):
+ super().__init__()
+ self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
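+        # PoolFormer-style token mixing: local average pooling minus the identity, so the residual connection
+        # in the surrounding block adds the input back in.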
+ output = self.pool(hidden_states) - hidden_states
+ return output
+
+
+class EfficientFormerDenseMlp(nn.Module):
+ def __init__(
+ self,
+ config: EfficientFormerConfig,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ ):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+
+ self.linear_in = nn.Linear(in_features, hidden_features)
+ self.activation = ACT2FN[config.hidden_act]
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.linear_out = nn.Linear(hidden_features, out_features)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.linear_in(hidden_states)
+ hidden_states = self.activation(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.linear_out(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+class EfficientFormerConvMlp(nn.Module):
+ def __init__(
+ self,
+ config: EfficientFormerConfig,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ drop: float = 0.0,
+ ):
+ super().__init__()
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+
+ self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
+ self.activation = ACT2FN[config.hidden_act]
+ self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
+ self.dropout = nn.Dropout(drop)
+
+ self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
+ self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ hidden_state = self.convolution1(hidden_state)
+ hidden_state = self.batchnorm_before(hidden_state)
+
+ hidden_state = self.activation(hidden_state)
+ hidden_state = self.dropout(hidden_state)
+ hidden_state = self.convolution2(hidden_state)
+
+ hidden_state = self.batchnorm_after(hidden_state)
+ hidden_state = self.dropout(hidden_state)
+
+ return hidden_state
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
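+
+    Illustrative example (not from the original file; in training mode the output is stochastic, while
+    inference is the identity):
+
+    ```python
+    >>> x = torch.ones(2, 3)
+    >>> torch.equal(drop_path(x, drop_prob=0.2, training=False), x)
+    True
+    ```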
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
+
+
+class EfficientFormerDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return f"p={self.drop_prob}"
+
+
+class EfficientFormerFlat(nn.Module):
+ def __init__(self):
+ super().__init__()
+
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+ hidden_states = hidden_states.flatten(2).transpose(1, 2)
+ return hidden_states
+
+
+class EfficientFormerMeta3D(nn.Module):
+ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
+ super().__init__()
+
+ self.token_mixer = EfficientFormerSelfAttention(
+ dim=config.dim,
+ key_dim=config.key_dim,
+ num_heads=config.num_attention_heads,
+ attention_ratio=config.attention_ratio,
+ resolution=config.resolution,
+ )
+
+ self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+ self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+
+ mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+ self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)
+
+ self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+ self.use_layer_scale = config.use_layer_scale
+ if config.use_layer_scale:
+ self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+ self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+
+ def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+ self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions)
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ if self.use_layer_scale:
+ layer_output = hidden_states + self.drop_path(
+ self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output
+ )
+ layer_output = layer_output + self.drop_path(
+ self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output))
+ )
+ else:
+ layer_output = hidden_states + self.drop_path(attention_output)
+ layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output)))
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+class EfficientFormerMeta3DLayers(nn.Module):
+ def __init__(self, config: EfficientFormerConfig):
+ super().__init__()
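+        # Stochastic depth schedule: the drop-path rate grows with each block's absolute depth in the network
+        # (all 4D blocks from the earlier stages come before these final 3D blocks).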
+ drop_paths = [
+ config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
+ for block_idx in range(config.num_meta3d_blocks)
+ ]
+ self.blocks = nn.ModuleList(
+ [EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths]
+ )
+
+ def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+ all_attention_outputs = () if output_attentions else None
+
+ for layer_module in self.blocks:
+ if isinstance(hidden_states, tuple):
+ hidden_states = hidden_states[0]
+
+ hidden_states = layer_module(hidden_states, output_attentions)
+
+ if output_attentions:
+ all_attention_outputs = all_attention_outputs + (hidden_states[1],)
+
+ if output_attentions:
+ outputs = (hidden_states[0],) + all_attention_outputs
+ return outputs
+
+ return hidden_states
+
+
+class EfficientFormerMeta4D(nn.Module):
+ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
+ super().__init__()
+ pool_size = config.pool_size if config.pool_size is not None else 3
+ self.token_mixer = EfficientFormerPooling(pool_size=pool_size)
+ mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+ self.mlp = EfficientFormerConvMlp(
+ config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob
+ )
+
+ self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+ self.use_layer_scale = config.use_layer_scale
+ if config.use_layer_scale:
+ self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+ self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+ outputs = self.token_mixer(hidden_states)
+
+ if self.use_layer_scale:
+ layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs)
+
+ layer_output = layer_output + self.drop_path(
+ self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output)
+ )
+ else:
+ layer_output = hidden_states + self.drop_path(outputs)
+ layer_output = layer_output + self.drop_path(self.mlp(layer_output))
+
+ return layer_output
+
+
+class EfficientFormerMeta4DLayers(nn.Module):
+ def __init__(self, config: EfficientFormerConfig, stage_idx: int):
+ super().__init__()
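+        # The last stage (stage_idx == -1) reserves its final `num_meta3d_blocks` blocks for the
+        # attention-based 3D layers, so fewer 4D blocks are created there.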
+ num_layers = (
+ config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
+ )
+ drop_paths = [
+ config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
+ ]
+
+ self.blocks = nn.ModuleList(
+ [
+ EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
+ for drop_path in drop_paths
+ ]
+ )
+
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+ for layer_module in self.blocks:
+ hidden_states = layer_module(hidden_states)
+ return hidden_states
+
+
+class EfficientFormerIntermediateStage(nn.Module):
+ def __init__(self, config: EfficientFormerConfig, index: int):
+ super().__init__()
+ self.meta4D_layers = EfficientFormerMeta4DLayers(config, index)
+
+ def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+ hidden_states = self.meta4D_layers(hidden_states)
+ return hidden_states
+
+
+class EfficientFormerLastStage(nn.Module):
+ def __init__(self, config: EfficientFormerConfig):
+ super().__init__()
+ self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1)
+ self.flat = EfficientFormerFlat()
+ self.meta3D_layers = EfficientFormerMeta3DLayers(config)
+
+ def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+ hidden_states = self.meta4D_layers(hidden_states)
+ hidden_states = self.flat(hidden_states)
+ hidden_states = self.meta3D_layers(hidden_states, output_attentions)
+
+ return hidden_states
+
+
+class EfficientFormerEncoder(nn.Module):
+ def __init__(self, config: EfficientFormerConfig):
+ super().__init__()
+ self.config = config
+ num_intermediate_stages = len(config.depths) - 1
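+        # Downsample between two stages when configured explicitly or whenever the hidden size changes.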
+ downsamples = [
+ config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
+ for i in range(num_intermediate_stages)
+ ]
+ intermediate_stages = []
+
+ for i in range(num_intermediate_stages):
+ intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
+ if downsamples[i]:
+ intermediate_stages.append(
+ EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1])
+ )
+
+ self.intermediate_stages = nn.ModuleList(intermediate_stages)
+ self.last_stage = EfficientFormerLastStage(config)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ output_hidden_states: bool = False,
+ output_attentions: bool = False,
+ return_dict: bool = True,
+ ) -> BaseModelOutput:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ for layer_module in self.intermediate_stages:
+ hidden_states = layer_module(hidden_states)
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + layer_output[1:]
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (layer_output[0],)
+
+ if not return_dict:
+ return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
+
+ return BaseModelOutput(
+ last_hidden_state=layer_output[0],
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class EfficientFormerPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: EfficientFormerConfig
+ base_model_prefix = "efficientformer"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = False
+
+ def _init_weights(self, module: nn.Module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+EFFICIENTFORMER_START_DOCSTRING = r"""
+ This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) subclass. Use it as a
+ regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+ Parameters:
+ config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See
+ [`ViTImageProcessor.preprocess`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
+ EFFICIENTFORMER_START_DOCSTRING,
+)
+class EfficientFormerModel(EfficientFormerPreTrainedModel):
+    _no_split_modules = ["EfficientFormerMeta4D"]
+
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__(config)
+        self.config = config
+
+ self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0])
+ self.encoder = EfficientFormerEncoder(config)
+ self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, BaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ embedding_output = self.patch_embed(pixel_values)
+ encoder_outputs = self.encoder(
+ embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states
+ )
+
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+
+ if not return_dict:
+ head_outputs = (sequence_output,)
+ return head_outputs + encoder_outputs[1:]
+
+ return BaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final
+ hidden state of the [CLS] token) e.g. for ImageNet.
+ """,
+ EFFICIENTFORMER_START_DOCSTRING,
+)
+class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel):
+ def __init__(self, config: EfficientFormerConfig):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.efficientformer = EfficientFormerModel(config)
+
+ # Classifier head
+ self.classifier = (
+ nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, ImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.efficientformer(
+ pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.classifier(sequence_output.mean(-2))
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(labels, logits, self.config)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@dataclass
+class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
+ """
+ Output type of [`EfficientFormerForImageClassificationWithTeacher`].
+
+ Args:
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores as the average of the cls_logits and distillation logits.
+ cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
+ class token).
+ distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
+ distillation token).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ logits: Optional[torch.FloatTensor] = None
+ cls_logits: Optional[torch.FloatTensor] = None
+ distillation_logits: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@add_start_docstrings(
+ """
+ EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
+ state of the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for
+    ImageNet.
+
+    <Tip warning={true}>
+
+    This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
+    supported.
+
+    </Tip>
+    """,
+ EFFICIENTFORMER_START_DOCSTRING,
+)
+class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel):
+ def __init__(self, config: EfficientFormerConfig):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.efficientformer = EfficientFormerModel(config)
+
+ # Classifier head
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+ # Distillation head
+ self.distillation_classifier = (
+ nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=EfficientFormerForImageClassificationWithTeacherOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]:
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ outputs = self.efficientformer(
+ pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ cls_logits = self.classifier(sequence_output.mean(-2))
+ distillation_logits = self.distillation_classifier(sequence_output.mean(-2))
+
+ # during inference, return the average of both classifier predictions
+ logits = (cls_logits + distillation_logits) / 2
+
+ if not return_dict:
+ output = (logits, cls_logits, distillation_logits) + outputs[1:]
+ return output
+
+ return EfficientFormerForImageClassificationWithTeacherOutput(
+ logits=logits,
+ cls_logits=cls_logits,
+ distillation_logits=distillation_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "EfficientFormerForImageClassification",
+ "EfficientFormerForImageClassificationWithTeacher",
+ "EfficientFormerModel",
+ "EfficientFormerPreTrainedModel",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..643097e79c3eab47b62f7e7c4781b2b276418176
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
@@ -0,0 +1,1198 @@
+# coding=utf-8
+# Copyright 2023 Snapchat Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow EfficientFormer model."""
+
+import itertools
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import tensorflow as tf
+
+from ....activations_tf import ACT2FN
+from ....modeling_tf_outputs import (
+ TFBaseModelOutput,
+ TFBaseModelOutputWithPooling,
+ TFImageClassifierOutput,
+)
+from ....modeling_tf_utils import (
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ....tf_utils import shape_list, stable_softmax
+from ....utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+)
+from .configuration_efficientformer import EfficientFormerConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "EfficientFormerConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
+_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281"
+
+
+class TFEfficientFormerPatchEmbeddings(keras.layers.Layer):
+ """
+    This class performs downsampling between two stages. For an input tensor of shape [batch_size, height, width,
+    num_channels] it produces an output tensor of shape [batch_size, height/stride, width/stride, embed_dim].
+ """
+
+ def __init__(
+ self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True, **kwargs
+ ) -> None:
+ super().__init__(**kwargs)
+ self.num_channels = num_channels
+
+ self.padding = keras.layers.ZeroPadding2D(padding=config.downsample_pad)
+ self.projection = keras.layers.Conv2D(
+ filters=embed_dim,
+ kernel_size=config.downsample_patch_size,
+ strides=config.downsample_stride,
+ padding="valid",
+ name="projection",
+ )
+ # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+ self.norm = (
+ keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm")
+ if apply_norm
+ else tf.identity
+ )
+ self.embed_dim = embed_dim
+
+ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+ tf.debugging.assert_shapes(
+ [(pixel_values, (..., None, None, self.num_channels))],
+            message="Make sure that the channel dimension of the pixel values matches the one set in the configuration.",
+ )
+ embeddings = self.projection(self.padding(pixel_values))
+ embeddings = self.norm(embeddings, training=training)
+ return embeddings
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "projection", None) is not None:
+ with tf.name_scope(self.projection.name):
+ self.projection.build([None, None, None, self.num_channels])
+ if getattr(self, "norm", None) is not None:
+ if hasattr(self.norm, "name"):
+ with tf.name_scope(self.norm.name):
+ self.norm.build([None, None, None, self.embed_dim])
+
+
+class TFEfficientFormerSelfAttention(keras.layers.Layer):
+ def __init__(
+ self,
+ dim: int,
+ key_dim: int,
+ num_heads: int,
+ attention_ratio: int,
+ resolution: int,
+ config: EfficientFormerConfig,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.num_heads = num_heads
+ self.key_dim = key_dim
+ self.attention_ratio = attention_ratio
+ self.scale = key_dim**-0.5
+ self.total_key_dim = key_dim * num_heads
+ self.expanded_key_dim = int(attention_ratio * key_dim)
+ self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
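+        # A single fused projection produces Q and K (key_dim units per head each) plus V, whose per-head
+        # width is expanded by attention_ratio; hence 2 * total_key_dim + total_expanded_key_dim output units.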
+ hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
+
+ self.qkv = keras.layers.Dense(
+ units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
+ )
+ self.projection = keras.layers.Dense(
+ units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
+ )
+ self.resolution = resolution
+ self.dim = dim
+
+ def build(self, input_shape: tf.TensorShape) -> None:
+ points = list(itertools.product(range(self.resolution), range(self.resolution)))
+ num_points = len(points)
+ attention_offsets = {}
+
+ idxs = []
+
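+        # Each distinct absolute offset (|dy|, |dx|) between two grid points shares one learnable bias per
+        # head (a LeViT-style attention bias); idxs maps every ordered pair of points to its shared bias slot.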
+ for point_1 in points:
+ for point_2 in points:
+ offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
+ if offset not in attention_offsets:
+ attention_offsets[offset] = len(attention_offsets)
+ idxs.append(attention_offsets[offset])
+
+ self.attention_biases = self.add_weight(
+ shape=(self.num_heads, len(attention_offsets)),
+ initializer=keras.initializers.zeros(),
+ trainable=True,
+ name="attention_biases",
+ )
+ self.attention_bias_idxs = self.add_weight(
+ shape=(num_points, num_points),
+ trainable=False,
+ dtype=tf.int32,
+ name="attention_bias_idxs",
+ )
+
+ self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points)))
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "qkv", None) is not None:
+ with tf.name_scope(self.qkv.name):
+ self.qkv.build([None, None, self.dim])
+ if getattr(self, "projection", None) is not None:
+ with tf.name_scope(self.projection.name):
+ self.projection.build([None, None, self.total_expanded_key_dim])
+
+ def call(
+ self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+ ) -> tuple[tf.Tensor]:
+ batch_size, sequence_length, *_ = shape_list(hidden_states)
+ qkv = self.qkv(inputs=hidden_states)
+
+ query_layer, key_layer, value_layer = tf.split(
+ tf.reshape(tensor=qkv, shape=(batch_size, sequence_length, self.num_heads, -1)),
+ num_or_size_splits=[self.key_dim, self.key_dim, self.expanded_key_dim],
+ axis=3,
+ )
+
+ query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3])
+ key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3])
+ value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3])
+
+ attention_probs = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0, 1, 3, 2]))
+ scale = tf.cast(self.scale, dtype=attention_probs.dtype)
+ attention_probs = tf.multiply(attention_probs, scale)
+
+ attention_biases = tf.gather(params=self.attention_biases, indices=self.attention_bias_idxs, axis=1)
+ attention_probs = attention_probs + attention_biases
+ attention_probs = stable_softmax(logits=attention_probs, axis=-1)
+
+ context_layer = tf.matmul(attention_probs, value_layer)
+ context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
+
+ context_layer = tf.reshape(
+ tensor=context_layer, shape=(batch_size, sequence_length, self.total_expanded_key_dim)
+ )
+ context_layer = self.projection(context_layer)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class TFEfficientFormerConvStem(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs):
+ super().__init__(**kwargs)
+
+ self.padding = keras.layers.ZeroPadding2D(padding=1)
+ self.convolution1 = keras.layers.Conv2D(
+ filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1"
+ )
+ # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+ self.batchnorm_before = keras.layers.BatchNormalization(
+ axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
+ )
+
+ self.convolution2 = keras.layers.Conv2D(
+ filters=out_channels,
+ kernel_size=3,
+ strides=2,
+ padding="valid",
+ name="convolution2",
+ )
+ # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+ self.batchnorm_after = keras.layers.BatchNormalization(
+ axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
+ )
+
+ self.activation = keras.layers.Activation(activation=keras.activations.relu, name="activation")
+ self.out_channels = out_channels
+ self.config = config
+
+ def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+ features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training)
+ features = self.activation(features)
+ features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training)
+ features = self.activation(features)
+ return features
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution1", None) is not None:
+ with tf.name_scope(self.convolution1.name):
+ self.convolution1.build([None, None, None, self.config.num_channels])
+ if getattr(self, "batchnorm_before", None) is not None:
+ with tf.name_scope(self.batchnorm_before.name):
+ self.batchnorm_before.build([None, None, None, self.out_channels // 2])
+ if getattr(self, "convolution2", None) is not None:
+ with tf.name_scope(self.convolution2.name):
+ self.convolution2.build([None, None, None, self.out_channels // 2])
+ if getattr(self, "batchnorm_after", None) is not None:
+ with tf.name_scope(self.batchnorm_after.name):
+ self.batchnorm_after.build([None, None, None, self.out_channels])
+ if getattr(self, "activation", None) is not None:
+ with tf.name_scope(self.activation.name):
+ self.activation.build(None)
+
+
+class TFEfficientFormerPooling(keras.layers.Layer):
+ def __init__(self, pool_size: int, **kwargs):
+ super().__init__(**kwargs)
+ self.pool = keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same")
+
+ def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
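+        # Return pool(x) - x: the surrounding Meta4D block adds the input back through its residual
+        # connection, so the net effect of this token mixer is average pooling (PoolFormer-style).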
+ output = self.pool(hidden_states)
+ output = output - hidden_states
+ return output
+
+
+class TFEfficientFormerDenseMlp(keras.layers.Layer):
+ def __init__(
+ self,
+ config: EfficientFormerConfig,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+
+ self.linear_in = keras.layers.Dense(
+ units=hidden_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_in"
+ )
+ self.activation = ACT2FN[config.hidden_act]
+ self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+ self.linear_out = keras.layers.Dense(
+ units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out"
+ )
+ self.hidden_features = hidden_features
+ self.in_features = in_features
+
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_states = self.linear_in(inputs=hidden_states)
+ hidden_states = self.activation(hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+ hidden_states = self.linear_out(inputs=hidden_states)
+ hidden_states = self.dropout(inputs=hidden_states, training=training)
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "linear_in", None) is not None:
+ with tf.name_scope(self.linear_in.name):
+ self.linear_in.build([None, None, self.in_features])
+ if getattr(self, "linear_out", None) is not None:
+ with tf.name_scope(self.linear_out.name):
+ self.linear_out.build([None, None, self.hidden_features])
+
+
+class TFEfficientFormerConvMlp(keras.layers.Layer):
+ def __init__(
+ self,
+ config: EfficientFormerConfig,
+ in_features: int,
+ hidden_features: Optional[int] = None,
+ out_features: Optional[int] = None,
+ drop: float = 0.0,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ out_features = out_features or in_features
+ hidden_features = hidden_features or in_features
+
+ self.convolution1 = keras.layers.Conv2D(
+ filters=hidden_features,
+ kernel_size=1,
+ name="convolution1",
+ padding="valid",
+ )
+
+ self.activation = ACT2FN[config.hidden_act]
+
+ self.convolution2 = keras.layers.Conv2D(
+ filters=out_features,
+ kernel_size=1,
+ name="convolution2",
+ padding="valid",
+ )
+
+ self.dropout = keras.layers.Dropout(rate=drop)
+
+ # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+ self.batchnorm_before = keras.layers.BatchNormalization(
+ axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
+ )
+ # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+ self.batchnorm_after = keras.layers.BatchNormalization(
+ axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
+ )
+ self.hidden_features = hidden_features
+ self.in_features = in_features
+ self.out_features = out_features
+
+ def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+ hidden_state = self.convolution1(hidden_state)
+ hidden_state = self.batchnorm_before(hidden_state, training=training)
+ hidden_state = self.activation(hidden_state)
+ hidden_state = self.dropout(hidden_state, training=training)
+ hidden_state = self.convolution2(hidden_state)
+ hidden_state = self.batchnorm_after(hidden_state, training=training)
+ hidden_state = self.dropout(hidden_state, training=training)
+ return hidden_state
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "convolution1", None) is not None:
+ with tf.name_scope(self.convolution1.name):
+ self.convolution1.build([None, None, None, self.in_features])
+ if getattr(self, "convolution2", None) is not None:
+ with tf.name_scope(self.convolution2.name):
+ self.convolution2.build([None, None, None, self.hidden_features])
+ if getattr(self, "batchnorm_before", None) is not None:
+ with tf.name_scope(self.batchnorm_before.name):
+ self.batchnorm_before.build([None, None, None, self.hidden_features])
+ if getattr(self, "batchnorm_after", None) is not None:
+ with tf.name_scope(self.batchnorm_after.name):
+ self.batchnorm_after.build([None, None, None, self.out_features])
+
+
+# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer
+class TFEfficientFormerDropPath(keras.layers.Layer):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+ References:
+ (1) github.com:rwightman/pytorch-image-models
+ """
+
+ def __init__(self, drop_path: float, **kwargs):
+ super().__init__(**kwargs)
+ self.drop_path = drop_path
+
+ def call(self, x: tf.Tensor, training=None):
+ if training:
+ keep_prob = 1 - self.drop_path
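+            # Per-sample mask with broadcastable shape (batch_size, 1, ..., 1): floor(keep_prob + U[0, 1))
+            # equals 1 with probability keep_prob, and dividing by keep_prob keeps the expected value unchanged.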
+ shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+ random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
+ random_tensor = tf.floor(random_tensor)
+ return (x / keep_prob) * random_tensor
+ return x
+
+
+class TFEfficientFormerFlat(keras.layers.Layer):
+ def __init__(self, **kwargs):
+ super().__init__(**kwargs)
+
+ def call(self, hidden_states: tf.Tensor) -> tuple[tf.Tensor]:
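+        # Collapse the spatial grid into a token dimension:
+        # (batch_size, height, width, in_channels) -> (batch_size, height * width, in_channels).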
+ batch_size, _, _, in_channels = shape_list(hidden_states)
+ hidden_states = tf.reshape(hidden_states, shape=[batch_size, -1, in_channels])
+ return hidden_states
+
+
+class TFEfficientFormerMeta3D(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
+ super().__init__(**kwargs)
+
+ self.token_mixer = TFEfficientFormerSelfAttention(
+ dim=config.dim,
+ key_dim=config.key_dim,
+ num_heads=config.num_attention_heads,
+ attention_ratio=config.attention_ratio,
+ resolution=config.resolution,
+ name="token_mixer",
+ config=config,
+ )
+ self.dim = dim
+ self.config = config
+
+ self.layernorm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1")
+ self.layernorm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2")
+ mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+ self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp")
+
+        # Using `layers.Activation` instead of `tf.identity` to better control `training` behavior.
+ self.drop_path = (
+ TFEfficientFormerDropPath(drop_path)
+ if drop_path > 0.0
+ else keras.layers.Activation("linear", name="drop_path")
+ )
+
+ def build(self, input_shape=None):
+ self.layer_scale_1 = None
+ self.layer_scale_2 = None
+
+ if self.config.use_layer_scale:
+ self.layer_scale_1 = self.add_weight(
+ shape=(self.dim,),
+ initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+ trainable=True,
+ name="layer_scale_1",
+ )
+ self.layer_scale_2 = self.add_weight(
+ shape=(self.dim,),
+ initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+ trainable=True,
+ name="layer_scale_2",
+ )
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "token_mixer", None) is not None:
+ with tf.name_scope(self.token_mixer.name):
+ self.token_mixer.build(None)
+ if getattr(self, "layernorm1", None) is not None:
+ with tf.name_scope(self.layernorm1.name):
+ self.layernorm1.build([None, None, self.dim])
+ if getattr(self, "layernorm2", None) is not None:
+ with tf.name_scope(self.layernorm2.name):
+ self.layernorm2.build([None, None, self.dim])
+ if getattr(self, "mlp", None) is not None:
+ with tf.name_scope(self.mlp.name):
+ self.mlp.build(None)
+ if getattr(self, "drop_path", None) is not None:
+ with tf.name_scope(self.drop_path.name):
+ self.drop_path.build(None)
+
+ def call(
+ self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+ ) -> tuple[tf.Tensor]:
+ self_attention_outputs = self.token_mixer(
+ hidden_states=self.layernorm1(hidden_states, training=training),
+ output_attentions=output_attentions,
+ training=training,
+ )
+
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ if self.config.use_layer_scale:
+ layer_output = hidden_states + self.drop_path(
+ tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * attention_output,
+ training=training,
+ )
+ layer_output = layer_output + self.drop_path(
+ tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
+ * self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
+ training=training,
+ )
+ else:
+ layer_output = hidden_states + self.drop_path(attention_output, training=training)
+ layer_output = layer_output + self.drop_path(
+ self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
+ training=training,
+ )
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+class TFEfficientFormerMeta3DLayers(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, **kwargs):
+ super().__init__(**kwargs)
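+        # Stochastic depth grows linearly with the global block index; the meta3d blocks come after all
+        # meta4d stages, hence the sum(config.depths[:-1]) offset.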
+ drop_paths = [
+ config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
+ for block_idx in range(config.num_meta3d_blocks)
+ ]
+ self.blocks = [
+ TFEfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path, name=f"blocks.{i}")
+ for i, drop_path in enumerate(drop_paths)
+ ]
+
+ def call(
+ self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+ ) -> tuple[tf.Tensor]:
+ all_attention_outputs = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.blocks):
+ if isinstance(hidden_states, tuple):
+ hidden_states = hidden_states[0]
+
+ hidden_states = layer_module(
+ hidden_states=hidden_states, output_attentions=output_attentions, training=training
+ )
+ if output_attentions:
+ all_attention_outputs = all_attention_outputs + (hidden_states[1],)
+
+ if output_attentions:
+ outputs = (hidden_states[0],) + all_attention_outputs
+ return outputs
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "blocks", None) is not None:
+ for layer in self.blocks:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFEfficientFormerMeta4D(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
+ super().__init__(**kwargs)
+ pool_size = config.pool_size if config.pool_size is not None else 3
+ self.token_mixer = TFEfficientFormerPooling(pool_size=pool_size, name="token_mixer")
+ self.dim = dim
+ mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+ self.mlp = TFEfficientFormerConvMlp(
+ config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob, name="mlp"
+ )
+
+ self.drop_path = (
+ TFEfficientFormerDropPath(drop_path, name="drop_path")
+ if drop_path > 0.0
+ else keras.layers.Activation("linear", name="drop_path")
+ )
+ self.config = config
+
+ def build(self, input_shape=None):
+ self.layer_scale_1 = None
+ self.layer_scale_2 = None
+
+ if self.config.use_layer_scale:
+ self.layer_scale_1 = self.add_weight(
+                shape=(self.dim,),
+ initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+ trainable=True,
+ name="layer_scale_1",
+ )
+ self.layer_scale_2 = self.add_weight(
+                shape=(self.dim,),
+ initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+ trainable=True,
+ name="layer_scale_2",
+ )
+
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "token_mixer", None) is not None:
+ with tf.name_scope(self.token_mixer.name):
+ self.token_mixer.build(None)
+ if getattr(self, "mlp", None) is not None:
+ with tf.name_scope(self.mlp.name):
+ self.mlp.build(None)
+ if getattr(self, "drop_path", None) is not None:
+ with tf.name_scope(self.drop_path.name):
+ self.drop_path.build(None)
+
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]:
+ outputs = self.token_mixer(hidden_states)
+
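+        # LayerScale: a learnable per-channel scale, broadcast to shape (1, 1, dim), applied to each
+        # residual branch before it is added back to the input.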
+ if self.config.use_layer_scale:
+ layer_output = hidden_states + self.drop_path(
+ tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * outputs,
+ training=training,
+ )
+
+ layer_output = layer_output + self.drop_path(
+ tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
+ * self.mlp(hidden_state=layer_output, training=training),
+ training=training,
+ )
+
+ else:
+ layer_output = hidden_states + self.drop_path(outputs, training=training)
+ layer_output = layer_output + self.drop_path(
+ self.mlp(hidden_state=layer_output, training=training), training=training
+ )
+
+ return layer_output
+
+
+class TFEfficientFormerMeta4DLayers(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs):
+ super().__init__(**kwargs)
+ num_layers = (
+ config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
+ )
+ drop_paths = [
+ config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
+ ]
+
+ self.blocks = [
+ TFEfficientFormerMeta4D(
+ config=config, dim=config.hidden_sizes[stage_idx], drop_path=drop_paths[i], name=f"blocks.{i}"
+ )
+ for i in range(len(drop_paths))
+ ]
+
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]:
+ for layer_module in self.blocks:
+ hidden_states = layer_module(hidden_states=hidden_states, training=training)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "blocks", None) is not None:
+ for layer in self.blocks:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+class TFEfficientFormerIntermediateStage(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, index: int, **kwargs):
+ super().__init__(**kwargs)
+ self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers")
+
+ def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]:
+ hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "meta4D_layers", None) is not None:
+ with tf.name_scope(self.meta4D_layers.name):
+ self.meta4D_layers.build(None)
+
+
+class TFEfficientFormerLastStage(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, **kwargs):
+ super().__init__(**kwargs)
+ self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers")
+ self.flat = TFEfficientFormerFlat(name="flat")
+ self.meta3D_layers = TFEfficientFormerMeta3DLayers(config, name="meta3D_layers")
+
+ def call(
+ self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+ ) -> tuple[tf.Tensor]:
+ hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
+ hidden_states = self.flat(hidden_states=hidden_states)
+ hidden_states = self.meta3D_layers(
+ hidden_states=hidden_states, output_attentions=output_attentions, training=training
+ )
+
+ return hidden_states
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "meta4D_layers", None) is not None:
+ with tf.name_scope(self.meta4D_layers.name):
+ self.meta4D_layers.build(None)
+ if getattr(self, "flat", None) is not None:
+ with tf.name_scope(self.flat.name):
+ self.flat.build(None)
+ if getattr(self, "meta3D_layers", None) is not None:
+ with tf.name_scope(self.meta3D_layers.name):
+ self.meta3D_layers.build(None)
+
+
+class TFEfficientFormerEncoder(keras.layers.Layer):
+ def __init__(self, config: EfficientFormerConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ num_intermediate_stages = len(config.depths) - 1
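+        # Downsample between two consecutive stages when the config requests it or whenever the hidden
+        # size changes from one stage to the next.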
+ downsamples = [
+ config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
+ for i in range(num_intermediate_stages)
+ ]
+
+ intermediate_stages = []
+ layer_count = -1
+ for i in range(num_intermediate_stages):
+ layer_count += 1
+ intermediate_stages.append(
+ TFEfficientFormerIntermediateStage(config, i, name=f"intermediate_stages.{layer_count}")
+ )
+ if downsamples[i]:
+ layer_count += 1
+ intermediate_stages.append(
+ TFEfficientFormerPatchEmbeddings(
+ config,
+ config.hidden_sizes[i],
+ config.hidden_sizes[i + 1],
+ name=f"intermediate_stages.{layer_count}",
+ )
+ )
+ self.intermediate_stages = intermediate_stages
+ self.last_stage = TFEfficientFormerLastStage(config, name="last_stage")
+
+ def call(
+ self,
+ hidden_states: tf.Tensor,
+ output_hidden_states: bool,
+ output_attentions: bool,
+ return_dict: bool,
+ training: bool = False,
+ ) -> TFBaseModelOutput:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ for layer_module in self.intermediate_stages:
+ hidden_states = layer_module(hidden_states, training=training)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_output = self.last_stage(hidden_states, output_attentions=output_attentions, training=training)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + layer_output[1:]
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (layer_output[0],)
+
+ if not return_dict:
+ return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
+
+ return TFBaseModelOutput(
+ last_hidden_state=layer_output[0],
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "last_stage", None) is not None:
+ with tf.name_scope(self.last_stage.name):
+ self.last_stage.build(None)
+ for layer in self.intermediate_stages:
+ with tf.name_scope(layer.name):
+ layer.build(None)
+
+
+@keras_serializable
+class TFEfficientFormerMainLayer(keras.layers.Layer):
+ config_class = EfficientFormerConfig
+
+ def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
+ super().__init__(**kwargs)
+ self.config = config
+
+ self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed")
+ self.encoder = TFEfficientFormerEncoder(config, name="encoder")
+ self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+
+ @unpack_inputs
+ def call(
+ self,
+ pixel_values: Optional[tf.Tensor] = None,
+ output_attentions: Optional[tf.Tensor] = None,
+ output_hidden_states: Optional[tf.Tensor] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[TFBaseModelOutput, tuple[tf.Tensor, ...]]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+        # When running on CPU, keras.layers.Conv2D and keras.layers.AveragePooling2D do not
+ # support channels first NCHW format. A number of blocks contain both.
+ # So change the input format from (batch_size, num_channels, height, width) to
+ # (batch_size, height, width, num_channels) here.
+ # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+ pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+ embedding_output = self.patch_embed(pixel_values, training=training)
+
+ encoder_outputs = self.encoder(
+ hidden_states=embedding_output,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output, training=training)
+
+ # Change the hidden states from (batch_size, height, width, num_channels) to
+ # (batch_size, num_channels, height, width).
+ # The hidden states are in (batch_size, height, width, num_channels)
+ # shape after all stages except the MB3D blocks.
+ if output_hidden_states:
+ hidden_states = tuple(tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1][:-1]) + (
+ encoder_outputs[1][-1],
+ )
+
+ if not return_dict:
+ head_outputs = (sequence_output,)
+ return head_outputs + encoder_outputs[1:]
+
+ return TFBaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "patch_embed", None) is not None:
+ with tf.name_scope(self.patch_embed.name):
+ self.patch_embed.build(None)
+ if getattr(self, "encoder", None) is not None:
+ with tf.name_scope(self.encoder.name):
+ self.encoder.build(None)
+ if getattr(self, "layernorm", None) is not None:
+ with tf.name_scope(self.layernorm.name):
+ self.layernorm.build([None, None, self.config.hidden_sizes[-1]])
+
+
+class TFEfficientFormerPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = EfficientFormerConfig
+ base_model_prefix = "efficientformer"
+ main_input_name = "pixel_values"
+
+
+EFFICIENTFORMER_START_DOCSTRING = r"""
+ This model is a TensorFlow
+ [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
+    TensorFlow Module and refer to the TensorFlow documentation for all matters related to general usage and behavior.
+
+
+ Parameters:
+ config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
+ Args:
+        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`EfficientFormerImageProcessor.__call__`] for details.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
+ EFFICIENTFORMER_START_DOCSTRING,
+)
+class TFEfficientFormerModel(TFEfficientFormerPreTrainedModel):
+ def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
+ super().__init__(config, **kwargs)
+
+ self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFBaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def call(
+ self,
+ pixel_values: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[tuple, TFBaseModelOutput]:
+ outputs = self.efficientformer(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+ return outputs
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "efficientformer", None) is not None:
+ with tf.name_scope(self.efficientformer.name):
+ self.efficientformer.build(None)
+
+
+@add_start_docstrings(
+ """
+    EfficientFormer Model transformer with an image classification head on top of the pooled last hidden state, e.g.
+    for ImageNet.
+ """,
+ EFFICIENTFORMER_START_DOCSTRING,
+)
+class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config: EfficientFormerConfig):
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
+
+ # Classifier head
+ self.classifier = (
+ keras.layers.Dense(config.num_labels, name="classifier")
+ if config.num_labels > 0
+ else keras.layers.Activation("linear", name="classifier")
+ )
+ self.config = config
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=TFImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def call(
+ self,
+ pixel_values: Optional[tf.Tensor] = None,
+ labels: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[tf.Tensor, TFImageClassifierOutput]:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.efficientformer(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))
+
+ loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFImageClassifierOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "efficientformer", None) is not None:
+ with tf.name_scope(self.efficientformer.name):
+ self.efficientformer.build(None)
+ if getattr(self, "classifier", None) is not None:
+ if hasattr(self.classifier, "name"):
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.hidden_sizes[-1]])
+
+
+@dataclass
+class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
+ """
+    Output type of [`TFEfficientFormerForImageClassificationWithTeacher`].
+
+    Args:
+        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores as the average of the cls_logits and distillation logits.
+        cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of
+            the class token).
+        distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of
+            the distillation token).
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
+            the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+ """
+
+ logits: Optional[tf.Tensor] = None
+ cls_logits: Optional[tf.Tensor] = None
+ distillation_logits: Optional[tf.Tensor] = None
+ hidden_states: Optional[tuple[tf.Tensor]] = None
+ attentions: Optional[tuple[tf.Tensor]] = None
+
+
+@add_start_docstrings(
+ """
+ EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
+ state and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.
+
+    <Tip warning={true}>
+
+    This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
+    supported.
+
+    </Tip>
+ """,
+ EFFICIENTFORMER_START_DOCSTRING,
+)
+class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTrainedModel):
+ def __init__(self, config: EfficientFormerConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
+
+ # Classifier heads
+ self.classifier = (
+ keras.layers.Dense(config.num_labels, name="classifier")
+ if config.num_labels > 0
+ else keras.layers.Activation("linear", name="classifier")
+ )
+ self.distillation_classifier = (
+ keras.layers.Dense(config.num_labels, name="distillation_classifier")
+ if config.num_labels > 0
+ else keras.layers.Activation("linear", name="distillation_classifier")
+ )
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=TFEfficientFormerForImageClassificationWithTeacherOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def call(
+ self,
+ pixel_values: Optional[tf.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: bool = False,
+ ) -> Union[tuple, TFEfficientFormerForImageClassificationWithTeacherOutput]:
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if training:
+ raise Exception(
+ "This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet supported."
+ )
+
+ outputs = self.efficientformer(
+ pixel_values=pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ sequence_output = outputs[0]
+
+ cls_logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))
+ distillation_logits = self.distillation_classifier(tf.reduce_mean(sequence_output, axis=-2))
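+        # During inference, return the average of both classifier predictions.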
+ logits = (cls_logits + distillation_logits) / 2
+
+ if not return_dict:
+ output = (logits, cls_logits, distillation_logits) + outputs[1:]
+ return output
+
+ return TFEfficientFormerForImageClassificationWithTeacherOutput(
+ logits=logits,
+ cls_logits=cls_logits,
+ distillation_logits=distillation_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def build(self, input_shape=None):
+ if self.built:
+ return
+ self.built = True
+ if getattr(self, "efficientformer", None) is not None:
+ with tf.name_scope(self.efficientformer.name):
+ self.efficientformer.build(None)
+ if getattr(self, "classifier", None) is not None:
+ if hasattr(self.classifier, "name"):
+ with tf.name_scope(self.classifier.name):
+ self.classifier.build([None, None, self.config.hidden_sizes[-1]])
+ if getattr(self, "distillation_classifier", None) is not None:
+ if hasattr(self.distillation_classifier, "name"):
+ with tf.name_scope(self.distillation_classifier.name):
+ self.distillation_classifier.build([None, None, self.config.hidden_sizes[-1]])
+
+
+__all__ = [
+ "TFEfficientFormerForImageClassification",
+ "TFEfficientFormerForImageClassificationWithTeacher",
+ "TFEfficientFormerModel",
+ "TFEfficientFormerPreTrainedModel",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2beb8f463ff10af33ea980599ea4d5fc05888acb
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2023 The HuggingFace and Baidu Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_ernie_m import *
+ from .modeling_ernie_m import *
+ from .tokenization_ernie_m import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4a02373a817fd06440cf2fee4c6af47876048405
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c5b9a0bfccd95868aacbc89642906e5a0fb1f8e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/configuration_ernie_m.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99fdab17fec70b8e9f52c2907e266fbfd756b32d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/modeling_ernie_m.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da9bd19da299360e3bff63124c8ea253466febd0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/__pycache__/tokenization_ernie_m.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
new file mode 100644
index 0000000000000000000000000000000000000000..839b17b11bf7248136668d8c325f03e10a64cd96
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ErnieM model configuration"""
+# Adapted from the original paddlenlp repository (https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py)
+
+from __future__ import annotations
+
+from ....configuration_utils import PretrainedConfig
+
+
+class ErnieMConfig(PretrainedConfig):
+ r"""
+    This is the configuration class to store the configuration of an [`ErnieMModel`]. It is used to instantiate an
+ Ernie-M model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the `Ernie-M`
+ [susnato/ernie-m-base_pytorch](https://huggingface.co/susnato/ernie-m-base_pytorch) architecture.
+
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 250002):
+            Vocabulary size of `inputs_ids` in [`ErnieMModel`]. It is also the vocabulary size of the token
+            embedding matrix. Defines the number of different tokens that can be represented by the `inputs_ids`
+            passed when calling [`ErnieMModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the embedding layer, encoder layers and pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to feed-forward layers are
+            first projected from hidden_size to intermediate_size, and then projected back to hidden_size. Typically
+ intermediate_size is larger than hidden_size.
+ hidden_act (`str`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function in the feed-forward layer. `"gelu"`, `"relu"` and any other torch
+ supported activation functions are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings and encoder.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability used in `MultiHeadAttention` in all encoder layers to drop some attention target.
+ max_position_embeddings (`int`, *optional*, defaults to 514):
+ The maximum value of the dimensionality of position encoding, which dictates the maximum supported length
+ of an input sequence.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the normal initializer for initializing all weight matrices.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            The index of the padding token in the token vocabulary.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the layer normalization layers.
+ classifier_dropout (`float`, *optional*):
+ The dropout ratio for the classification head.
+ act_dropout (`float`, *optional*, defaults to 0.0):
+ This dropout probability is used in `ErnieMEncoderLayer` after activation.
+
+ A normal_initializer initializes weight matrices as normal distributions. See
+ `ErnieMPretrainedModel._init_weights()` for how weights are initialized in `ErnieMModel`.
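+
+    Example (a minimal usage sketch; it assumes the deprecated Ernie-M classes are still importable from the
+    top-level `transformers` namespace):
+
+    ```python
+    >>> from transformers import ErnieMConfig, ErnieMModel
+
+    >>> # Initializing an Ernie-M style configuration
+    >>> configuration = ErnieMConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = ErnieMModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```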
+ """
+
+ model_type = "ernie_m"
+ attribute_map: dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}
+
+ def __init__(
+ self,
+ vocab_size: int = 250002,
+ hidden_size: int = 768,
+ num_hidden_layers: int = 12,
+ num_attention_heads: int = 12,
+ intermediate_size: int = 3072,
+ hidden_act: str = "gelu",
+ hidden_dropout_prob: float = 0.1,
+ attention_probs_dropout_prob: float = 0.1,
+ max_position_embeddings: int = 514,
+ initializer_range: float = 0.02,
+ pad_token_id: int = 1,
+ layer_norm_eps: float = 1e-05,
+ classifier_dropout=None,
+ act_dropout=0.0,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.classifier_dropout = classifier_dropout
+ self.act_dropout = act_dropout
+
+
+__all__ = ["ErnieMConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0e97c132d0989079aa647cac4e64bbae451a116
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
@@ -0,0 +1,1061 @@
+# coding=utf-8
+# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ErnieM model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ MultipleChoiceModelOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_ernie_m import ErnieMConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch"
+_CONFIG_FOR_DOC = "ErnieMConfig"
+_TOKENIZER_FOR_DOC = "ErnieMTokenizer"
+
+
+# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings
+class ErnieMEmbeddings(nn.Module):
+ """Construct the embeddings from word and position embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.position_embeddings = nn.Embedding(
+ config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id
+ )
+ self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
+ self.padding_idx = config.pad_token_id
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.LongTensor] = None,
+ past_key_values_length: int = 0,
+ ) -> torch.Tensor:
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+ if position_ids is None:
+ input_shape = inputs_embeds.size()[:-1]
+ ones = torch.ones(input_shape, dtype=torch.int64, device=inputs_embeds.device)
+ seq_length = torch.cumsum(ones, dim=1)
+ position_ids = seq_length - ones
+
+ if past_key_values_length > 0:
+ position_ids = position_ids + past_key_values_length
+ # to mimic the paddlenlp implementation: positions are offset by 2 so that indices 0 and 1
+ # (the position-embedding padding_idx) are never used by real tokens
+ position_ids += 2
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings = inputs_embeds + position_embeddings
+ embeddings = self.layer_norm(embeddings)
+ embeddings = self.dropout(embeddings)
+
+ return embeddings
+
+
+class ErnieMSelfAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.q_proj = nn.Linear(config.hidden_size, self.all_head_size)
+ self.k_proj = nn.Linear(config.hidden_size, self.all_head_size)
+ self.v_proj = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = position_embedding_type or getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+ self.is_decoder = config.is_decoder
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
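+ # reshape (batch, seq_len, all_head_size) -> (batch, num_heads, seq_len, head_size)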
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ mixed_query_layer = self.q_proj(hidden_states)
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention and past_key_values is not None:
+ # reuse k,v, cross_attentions
+ key_layer = past_key_values[0]
+ value_layer = past_key_values[1]
+ attention_mask = encoder_attention_mask
+ elif is_cross_attention:
+ key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_values is not None:
+ key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
+ value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
+ key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
+ value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ use_cache = past_key_values is not None
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_values` is always `None`
+ past_key_values = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+ if use_cache:
+ position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+ -1, 1
+ )
+ else:
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+ distance = position_ids_l - position_ids_r
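+ # since positions are < max_position_embeddings, distance lies in [-(max - 1), max - 1];
+ # the shift below maps it onto the [0, 2 * max - 2] rows of the distance embedding table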
+
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in the ErnieMModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ if self.is_decoder:
+ outputs = outputs + (past_key_values,)
+ return outputs
+
+
+class ErnieMAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type)
+ self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self_attn.num_attention_heads, self.self_attn.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self_attn.q_proj = prune_linear_layer(self.self_attn.q_proj, index)
+ self.self_attn.k_proj = prune_linear_layer(self.self_attn.k_proj, index)
+ self.self_attn.v_proj = prune_linear_layer(self.self_attn.v_proj, index)
+ self.out_proj = prune_linear_layer(self.out_proj, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self_attn.num_attention_heads = self.self_attn.num_attention_heads - len(heads)
+ self.self_attn.all_head_size = self.self_attn.attention_head_size * self.self_attn.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ self_outputs = self.self_attn(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_values,
+ output_attentions,
+ )
+ attention_output = self.out_proj(self_outputs[0])
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class ErnieMEncoderLayer(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ # to mimic paddlenlp implementation
+ dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob
+ act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout
+
+ self.self_attn = ErnieMAttention(config)
+ self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size)
+ self.dropout = nn.Dropout(act_dropout)
+ self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout1 = nn.Dropout(dropout)
+ self.dropout2 = nn.Dropout(dropout)
+ if isinstance(config.hidden_act, str):
+ self.activation = ACT2FN[config.hidden_act]
+ else:
+ self.activation = config.hidden_act
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = True,
+ ):
+ residual = hidden_states
+ if output_attentions:
+ hidden_states, attention_opt_weights = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ )
+ else:
+ hidden_states = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ )
+ hidden_states = residual + self.dropout1(hidden_states)
+ hidden_states = self.norm1(hidden_states)
+ residual = hidden_states
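+ # feed-forward block; LayerNorm is applied after each residual connection (post-LN, matching paddlenlp)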
+
+ hidden_states = self.linear1(hidden_states)
+ hidden_states = self.activation(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.linear2(hidden_states)
+ hidden_states = residual + self.dropout2(hidden_states)
+ hidden_states = self.norm2(hidden_states)
+
+ if output_attentions:
+ return hidden_states, attention_opt_weights
+ else:
+ return hidden_states
+
+
+class ErnieMEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layers = nn.ModuleList([ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+
+ def forward(
+ self,
+ input_embeds: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+ hidden_states = () if output_hidden_states else None
+ attentions = () if output_attentions else None
+
+ output = input_embeds
+ if output_hidden_states:
+ hidden_states = hidden_states + (output,)
+ for i, layer in enumerate(self.layers):
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
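+ # `ErnieMEncoderLayer.forward` defaults to `output_attentions=True`, so the call below always
+ # returns a (hidden_states, attention_weights) tuple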
+ output, opt_attn_weights = layer(
+ hidden_states=output,
+ attention_mask=attention_mask,
+ head_mask=layer_head_mask,
+ past_key_values=past_key_values[i] if past_key_values is not None else None,
+ )
+
+ if output_hidden_states:
+ hidden_states = hidden_states + (output,)
+ if output_attentions:
+ attentions = attentions + (opt_attn_weights,)
+
+ last_hidden_state = output
+ if not return_dict:
+ return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None)
+
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=attentions
+ )
+
+
+class ErnieMPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class ErnieMPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: ErnieMConfig
+ base_model_prefix = "ernie_m"
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+ERNIE_M_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
+ heads, etc.)
+
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`ErnieMConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ERNIE_M_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.",
+ ERNIE_M_START_DOCSTRING,
+)
+class ErnieMModel(ErnieMPreTrainedModel):
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+ self.initializer_range = config.initializer_range
+ self.embeddings = ErnieMEmbeddings(config)
+ self.encoder = ErnieMEncoder(config)
+ self.pooler = ErnieMPooler(config) if add_pooling_layer else None
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layers[layer].self_attn.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ processor_class=_TOKENIZER_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPastAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.")
+
+ # resolve output flags from the config defaults when not explicitly passed
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ past_key_values_length = 0
+ if past_key_values is not None:
+ past_key_values_length = past_key_values.get_seq_length()
+
+ # Adapted from paddlenlp.transformers.ernie_m.ErnieMModel
+ if attention_mask is None:
+ attention_mask = (input_ids == self.config.pad_token_id).to(torch.float32)
+ attention_mask *= torch.finfo(attention_mask.dtype).min
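+ # padding positions now hold the large negative `finfo.min` (masked out); real tokens hold 0.0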
+ if past_key_values is not None:
+ batch_size = past_key_values[0][0].shape[0]
+ # keep the mask 2D here (it is broadcast to 4D below); past positions get 0.0, i.e. attended to
+ past_mask = torch.zeros(
+ [batch_size, past_key_values_length], dtype=attention_mask.dtype, device=attention_mask.device
+ )
+ attention_mask = torch.concat([past_mask, attention_mask], dim=-1)
+ # For 2D attention_mask from tokenizer
+ elif attention_mask.ndim == 2:
+ attention_mask = attention_mask.to(torch.float32)
+ attention_mask = 1.0 - attention_mask
+ attention_mask *= torch.finfo(attention_mask.dtype).min
+
+ extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ inputs_embeds=inputs_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ sequence_output = encoder_outputs[0]
+ pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
+ return (sequence_output, pooler_output) + encoder_outputs[1:]
+
+ sequence_output = encoder_outputs["last_hidden_state"]
+ pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
+ hidden_states = None if not output_hidden_states else encoder_outputs["hidden_states"]
+ attentions = None if not output_attentions else encoder_outputs["attentions"]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooler_output,
+ hidden_states=hidden_states,
+ attentions=attentions,
+ )
+
+
+@add_start_docstrings(
+ """ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of
+ the pooled output) e.g. for GLUE tasks.""",
+ ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.config = config
+
+ self.ernie_m = ErnieMModel(config)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ processor_class=_TOKENIZER_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = True,
+ labels: Optional[torch.Tensor] = None,
+ ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.ernie_m(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ past_key_values=past_key_values,
+ output_hidden_states=output_hidden_states,
+ output_attentions=output_attentions,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
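+ # infer the problem type once from `num_labels` and the label dtype, then cache it on the config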
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """ErnieM Model with a multiple choice classification head on top (a linear layer on top of
+ the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
+ ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.ernie_m = ErnieMModel(config)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, 1)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple[torch.FloatTensor], MultipleChoiceModelOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
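+ # flatten (batch_size, num_choices, seq_len) inputs to (batch_size * num_choices, seq_len) so that
+ # every choice is encoded independently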
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+ inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ outputs = self.ernie_m(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """ErnieM Model with a token classification head on top (a linear layer on top of
+ the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
+ ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForTokenClassification(ErnieMPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ processor_class=_TOKENIZER_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = True,
+ labels: Optional[torch.Tensor] = None,
+ ) -> Union[tuple[torch.FloatTensor], TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.ernie_m(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
+ ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ processor_class=_TOKENIZER_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ start_positions: Optional[torch.Tensor] = None,
+ end_positions: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.ernie_m(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
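+ # project every token to two logits (config.num_labels is expected to be 2 here) and split them
+ # into start/end scores of shape (batch_size, seq_len)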
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, splitting can add an extra dimension; squeeze it away
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to
+ compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
+ ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.ernie_m = ErnieMModel(config)
+ self.linear_start = nn.Linear(config.hidden_size, 1)
+ self.linear_end = nn.Linear(config.hidden_size, 1)
+ self.sigmoid = nn.Sigmoid()
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ start_positions: Optional[torch.Tensor] = None,
+ end_positions: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for position (index) for computing the start_positions loss. Positions outside of the sequence are
+ not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for position (index) for computing the end_positions loss. Positions outside of the sequence are
+ not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ result = self.ernie_m(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ if return_dict:
+ sequence_output = result.last_hidden_state
+ else:
+ sequence_output = result[0]
+
+ start_logits = self.linear_start(sequence_output)
+ start_logits = start_logits.squeeze(-1)
+ end_logits = self.linear_end(sequence_output)
+ end_logits = end_logits.squeeze(-1)
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, splitting can add an extra dimension; squeeze it away
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = BCEWithLogitsLoss()
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + result[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=result.hidden_states,
+ attentions=result.attentions,
+ )
+
+
+__all__ = [
+ "ErnieMForMultipleChoice",
+ "ErnieMForQuestionAnswering",
+ "ErnieMForSequenceClassification",
+ "ErnieMForTokenClassification",
+ "ErnieMModel",
+ "ErnieMPreTrainedModel",
+ "ErnieMForInformationExtraction",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c33abd043e6d170485e6d01298788d461275aa6
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
@@ -0,0 +1,409 @@
+# coding=utf-8
+# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Ernie-M."""
+
+import os
+import unicodedata
+from typing import Any, Optional
+
+import sentencepiece as spm
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
+from ....utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "sentencepiece_model_ckpt": "sentencepiece.bpe.model"}
+
+RESOURCE_FILES_NAMES = {
+ "sentencepiece_model_file": "sentencepiece.bpe.model",
+ "vocab_file": "vocab.txt",
+}
+
+
+# Adapted from paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer
+@requires(backends=("sentencepiece",))
+class ErnieMTokenizer(PreTrainedTokenizer):
+ r"""
+ Constructs an Ernie-M tokenizer. It uses `sentencepiece` to split words into sub-words.
+
+ Args:
+ sentencepiece_model_ckpt (`str`):
+ The file path of the sentencepiece model.
+ vocab_file (`str`, *optional*):
+ The file path of the vocabulary.
+ do_lower_case (`bool`, *optional*, defaults to `False`):
+ Whether or not to lowercase the input when tokenizing.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ A special token representing the unknown (out-of-vocabulary) token. A token that is not in the
+ vocabulary is set to `unk_token` in order to be converted to an ID.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ A special token separating two different sentences in the same input.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ A special token used to make arrays of tokens the same size for batching purposes.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ A special token used for sequence classification. It is the first token of the sequence when built with
+ special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ A special token representing a masked token. This is the token used in the masked language modeling task
+ which the model tries to predict the original unmasked ones.
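+
+ Example (a minimal sketch; `./sentencepiece.bpe.model` is a placeholder for a real SentencePiece model file):
+
+ ```python
+ >>> from transformers import ErnieMTokenizer
+
+ >>> tokenizer = ErnieMTokenizer(sentencepiece_model_ckpt="./sentencepiece.bpe.model")
+ >>> tokenizer.tokenize("Hello world")  # sentencepiece sub-word pieces, e.g. ['▁Hello', '▁world']
+ ```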
+ """
+
+ # Ernie-M model doesn't have token_type embedding.
+ model_input_names: list[str] = ["input_ids"]
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ resource_files_names = RESOURCE_FILES_NAMES
+
+ def __init__(
+ self,
+ sentencepiece_model_ckpt,
+ vocab_file=None,
+ do_lower_case=False,
+ encoding="utf8",
+ unk_token="[UNK]",
+ sep_token="[SEP]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ sp_model_kwargs: Optional[dict[str, Any]] = None,
+ **kwargs,
+ ) -> None:
+ # The mask token behaves like a normal word, i.e. it includes the space before it and is kept in the
+ # raw text, so there should be a match in a non-normalized sentence.
+
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+ self.do_lower_case = do_lower_case
+ self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(sentencepiece_model_ckpt)
+
+ # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
+ if vocab_file is not None:
+ self.vocab = self.load_vocab(filepath=vocab_file)
+ else:
+ self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
+ self.reverse_vocab = {v: k for k, v in self.vocab.items()}
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ vocab_file=vocab_file,
+ encoding=encoding,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
+ def get_offset_mapping(self, text):
+ if text is None:
+ return None
+
+ split_tokens = self.tokenize(text)
+ normalized_text, char_mapping = "", []
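+ # char_mapping[j] holds the index, in the original text, of the j-th character of the normalized text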
+
+ for i, ch in enumerate(text):
+ if ch in self.SP_CHAR_MAPPING:
+ ch = self.SP_CHAR_MAPPING.get(ch)
+ else:
+ ch = unicodedata.normalize("NFKC", ch)
+ if self.is_whitespace(ch):
+ continue
+ normalized_text += ch
+ char_mapping.extend([i] * len(ch))
+
+ text, token_mapping, offset = normalized_text, [], 0
+
+ if self.do_lower_case:
+ text = text.lower()
+
+ for token in split_tokens:
+ if token[:1] == "▁":
+ token = token[1:]
+ start = text[offset:].index(token) + offset
+ end = start + len(token)
+
+ token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1))
+ offset = end
+ return token_mapping
+
+ @property
+ def vocab_size(self):
+ return len(self.vocab)
+
+ def get_vocab(self):
+ return dict(self.vocab, **self.added_tokens_encoder)
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+
+ # for backward compatibility
+ if not hasattr(self, "sp_model_kwargs"):
+ self.sp_model_kwargs = {}
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(self.sentencepiece_model_ckpt)
+
+ def clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ return "".join(self.SP_CHAR_MAPPING.get(c, c) for c in text)
+
+ def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1):
+ """Tokenize a string."""
+
+ if self.sp_model_kwargs.get("enable_sampling") is True:
+ enable_sampling = True
+ if self.sp_model_kwargs.get("alpha") is not None:
+ alpha = self.sp_model_kwargs.get("alpha")
+ if self.sp_model_kwargs.get("nbest_size") is not None:
+ nbest_size = self.sp_model_kwargs.get("nbest_size")
+
+ if not enable_sampling:
+ pieces = self.sp_model.EncodeAsPieces(text)
+ else:
+ pieces = self.sp_model.SampleEncodeAsPieces(text, nbest_size, alpha)
+ new_pieces = []
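+ # post-process the sentencepiece pieces: split CJK characters, punctuation, and digit/non-digit
+ # boundaries into separate tokens, mimicking the paddlenlp tokenizer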
+ for pi, piece in enumerate(pieces):
+ if piece == SPIECE_UNDERLINE:
+ # guard against an IndexError when "▁" is the final piece
+ if pi != 0 and pi + 1 < len(pieces) and not pieces[pi + 1].startswith(SPIECE_UNDERLINE):
+ new_pieces.append(SPIECE_UNDERLINE)
+ continue
+ lst_i = 0
+ for i, chunk in enumerate(piece):
+ if chunk == SPIECE_UNDERLINE:
+ continue
+ if self.is_ch_char(chunk) or self.is_punct(chunk):
+ if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
+ new_pieces.append(piece[lst_i:i])
+ new_pieces.append(chunk)
+ lst_i = i + 1
+ elif chunk.isdigit() and i > 0 and not piece[i - 1].isdigit():
+ if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
+ new_pieces.append(piece[lst_i:i])
+ lst_i = i
+ elif not chunk.isdigit() and i > 0 and piece[i - 1].isdigit():
+ if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
+ new_pieces.append(piece[lst_i:i])
+ lst_i = i
+ if len(piece) > lst_i:
+ new_pieces.append(piece[lst_i:])
+ return new_pieces
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (strings for sub-words) in a single string."""
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+ return out_string
+
+ def convert_ids_to_string(self, ids):
+ """
+ Converts a sequence of token ids into a single string.
+ """
+ tokens = self.convert_ids_to_tokens(ids)
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+ return out_string
+
+ # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
+ def _convert_token_to_id(self, token):
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+ # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.reverse_vocab.get(index, self.unk_token)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ r"""
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. An ErnieM sequence has the following format:
+
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] [SEP] B [SEP]`
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ Returns:
+ `list[int]`: List of input_id with the appropriate special tokens.
+ """
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ _cls = [self.cls_token_id]
+ _sep = [self.sep_token_id]
+ return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep
+
+ def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None):
+ r"""
+ Build an offset map from a pair of offset maps by concatenating and adding offsets of special tokens. An Ernie-M
+ offset_mapping has the following format:
+
+ - single sequence: `(0,0) X (0,0)`
+ - pair of sequences: `(0,0) A (0,0) (0,0) B (0,0)`
+
+ Args:
+ offset_mapping_0 (`list[tuple]`):
+ List of char offsets to which the special tokens will be added.
+ offset_mapping_1 (`list[tuple]`, *optional*):
+ Optional second list of wordpiece offsets for offset mapping pairs.
+ Returns:
+ `list[tuple]`: List of wordpiece offsets with the appropriate offsets of special tokens.
+ """
+ if offset_mapping_1 is None:
+ return [(0, 0)] + offset_mapping_0 + [(0, 0)]
+
+ return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)]
+
+ def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+ r"""
+ Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `encode` method.
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of ids of the first sequence.
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+ Returns:
+ `list[int]`:
+ The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ if token_ids_1 is not None:
+ raise ValueError(
+ "You should not supply a second sequence if the provided sequence of "
+ "ids is already formatted with special tokens for the model."
+ )
+ return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0]
+
+ if token_ids_1 is not None:
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ """
+ Create the token type IDs corresponding to the sequences passed. [What are token type
+ IDs?](../glossary#token-type-ids) Should be overridden in a subclass if the model has a special way of
+ building them.
+
+ Args:
+ token_ids_0 (`list[int]`):
+ The first tokenized sequence.
+ token_ids_1 (`list[int]`, *optional*):
+ The second tokenized sequence.
+ Returns:
+ `list[int]`: The token type ids.
+ """
+ # called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method
+ if token_ids_1 is None:
+ # [CLS] X [SEP]
+ return (len(token_ids_0) + 2) * [0]
+
+ # [CLS] A [SEP] [SEP] B [SEP]
+ return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3)
+
+ def is_ch_char(self, char):
+ """
+ is_ch_char
+ """
+ if "\u4e00" <= char <= "\u9fff":
+ return True
+ return False
+
+ def is_alpha(self, char):
+ """
+ is_alpha
+ """
+ if ("a" <= char <= "z") or ("A" <= char <= "Z"):
+ return True
+ return False
+
+ def is_punct(self, char):
+ """
+ is_punct
+ """
+ if char in ",;:.?!~,;:。?!《》【】":
+ return True
+ return False
+
+ def is_whitespace(self, char):
+ """
+ is whitespace
+ """
+ if char == " " or char == "\t" or char == "\n" or char == "\r":
+ return True
+ if len(char) == 1:
+ cat = unicodedata.category(char)
+ if cat == "Zs":
+ return True
+ return False
+
+ def load_vocab(self, filepath):
+ token_to_idx = {}
+ with open(filepath, "r", encoding="utf-8") as f:
+ for index, line in enumerate(f):
+ token = line.rstrip("\n")
+ token_to_idx[token] = int(index)
+
+ return token_to_idx
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ index = 0
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ else:
+ vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+ with open(vocab_file, "w", encoding="utf-8") as writer:
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+ " Please check that the vocabulary is not corrupted!"
+ )
+ index = token_index
+ writer.write(token + "\n")
+ index += 1
+
+ tokenizer_model_file = os.path.join(save_directory, "sentencepiece.bpe.model")
+ with open(tokenizer_model_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (vocab_file,)
+
+
+__all__ = ["ErnieMTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c23b58f35012f1ddc00e2275c484810287de6f8
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_gptsan_japanese import *
+ from .modeling_gptsan_japanese import *
+ from .tokenization_gptsan_japanese import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d23ffdd6728f8add8265e8c72af4cdf45bd4a7a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84efbe7ee76a657f9ce2eb0bae19a3a3d78684b1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/configuration_gptsan_japanese.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..075f001dc866e3a895c5bd3aa2bccca382a0e901
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/modeling_gptsan_japanese.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1555cf5a639de2029a1905eede230c42bd6b369e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/__pycache__/tokenization_gptsan_japanese.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d1e23e080eddc5cce28e72c551d123bec09de6
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2023, HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPTSAN-japanese model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTSanJapaneseConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate
+ a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese
+ [Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Arguments:
+ vocab_size (`int`, *optional*, defaults to 36000):
+ Vocabulary size of the GPTSANJapanese model. Defines the number of different tokens that can be represented
+ by the `input_ids` passed when calling [`GPTSanJapaneseModel`].
+ max_position_embeddings (`int`, *optional*, defaults to 1280):
+ The maximum sequence length that this model might ever be used with.
+ d_model (`int`, *optional*, defaults to 1024):
+ Size of the encoder layers and the pooler layer.
+ d_ff (`int`, *optional*, defaults to 8192):
+ Size of the intermediate feed forward layer in each `SwitchTransformersBlock`.
+ d_ext (`int`, *optional*, defaults to 4096):
+ Size of the intermediate feed forward layer in each extra layer.
+ d_spout (`int`, *optional*, defaults to 128):
+ Size of the `spout` vector.
+ num_switch_layers (`int`, *optional*, defaults to 10):
+ Number of Switch Transformer layers.
+ num_ext_layers (`int`, *optional*, defaults to 0):
+ Number of extra layers stacked after the Switch Transformer layers.
+ num_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_experts (`int`, *optional*, defaults to 16):
+ Number of experts for each SwitchTransformer layer.
+ expert_capacity (`int`, *optional*, defaults to 128):
+ Number of tokens that can be stored in each expert. If set to 1, the model will behave like a regular
+ Transformer.
+ dropout_rate (`float`, *optional*, defaults to 0.0):
+ The ratio for all dropout layers.
+ layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+ The epsilon used by the layer normalization layers.
+ router_bias (`bool`, *optional*, defaults to `False`):
+ Whether to add a bias to the router.
+ router_jitter_noise (`float`, *optional*, defaults to 0.0):
+ Amount of noise to add to the router. Set it to 0.0 during prediction, or to a small value (usually 1e-2)
+ during training.
+ router_dtype (`str`, *optional*, defaults to `"float32"`):
+ The `dtype` used for the routers. It is preferable to keep the `dtype` to `"float32"` as specified in the
+ *selective precision* discussion in [the paper](https://huggingface.co/papers/2101.03961).
+ router_ignore_padding_tokens (`bool`, *optional*, defaults to `False`):
+ Whether to ignore padding tokens when routing.
+ output_hidden_states (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ output_attentions (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the attentions tensors of all attention layers.
+ initializer_factor (`float`, *optional*, defaults to 0.002):
+ A factor for initializing all weight matrices.
+ output_router_logits (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the router logits of all experts.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ """
+
+ model_type = "gptsan-japanese"
+ keys_to_ignore_at_inference = [
+ "past_key_values",
+ ]
+ attribute_map = {
+ "hidden_size": "d_model",
+ "num_attention_heads": "num_heads",
+ "num_hidden_layers": "num_layers",
+ }
+
+ def __init__(
+ self,
+ vocab_size=36000,
+ max_position_embeddings=1280,
+ d_model=1024,
+ d_ff=8192,
+ d_ext=4096,
+ d_spout=128,
+ num_switch_layers=10,
+ num_ext_layers=0,
+ num_heads=16,
+ num_experts=16,
+ expert_capacity=128,
+ dropout_rate=0.0,
+ layer_norm_epsilon=1e-5,
+ router_bias=False,
+ router_jitter_noise=0.0,
+ router_dtype="float32",
+ router_ignore_padding_tokens=False,
+ output_hidden_states=False,
+ output_attentions=False,
+ initializer_factor=0.002,
+ output_router_logits=False,
+ use_cache=True,
+ separator_token_id=35998,
+ pad_token_id=35995,
+ eos_token_id=35999,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.d_model = d_model
+ self.d_ff = d_ff
+ self.d_ext = d_ext
+ self.d_spout = d_spout
+ self.num_switch_layers = num_switch_layers
+ self.num_ext_layers = num_ext_layers
+ self.num_layers = num_switch_layers + num_ext_layers
+ self.num_heads = num_heads
+ self.num_experts = num_experts
+ self.expert_capacity = expert_capacity
+ self.dropout_rate = dropout_rate
+ self.layer_norm_epsilon = layer_norm_epsilon
+ self.router_bias = router_bias
+ self.router_jitter_noise = router_jitter_noise
+ self.router_dtype = router_dtype
+ self.router_ignore_padding_tokens = router_ignore_padding_tokens
+ self.output_hidden_states = output_hidden_states
+ self.output_attentions = output_attentions
+ self.initializer_factor = initializer_factor
+ self.output_router_logits = output_router_logits
+ self.use_cache = use_cache
+
+ super().__init__(
+ separator_token_id=separator_token_id,
+ pad_token_id=pad_token_id,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+
+
+__all__ = ["GPTSanJapaneseConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e2d27f03bacbd60ade444fe3221ada9f714cb60
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
@@ -0,0 +1,1331 @@
+# coding=utf-8
+# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GPTSANJapanese model."""
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+ DUMMY_INPUTS,
+ DUMMY_MASK,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_torch_fx_proxy,
+ logging,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_gptsan_japanese import GPTSanJapaneseConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "GPTSanJapaneseConfig"
+_CHECKPOINT_FOR_DOC = "Tanrei/GPTSAN-japanese"
+
+
+def router_z_loss_func(router_logits: torch.Tensor) -> float:
+ r"""
+ Compute the router z-loss implemented in PyTorch.
+
+ The router z-loss was introduced in [Designing Effective Sparse Expert Models](https://huggingface.co/papers/2202.08906).
+ It encourages router logits to remain small in an effort to improve stability.
+
+ Args:
+ router_logits (`float`):
+ Input logits of shape [batch_size, sequence_length, num_experts]
+
+ Returns:
+ Scalar router z-loss.
+ """
+ num_groups, tokens_per_group, _ = router_logits.shape
+ log_z = torch.logsumexp(router_logits, dim=-1)
+ z_loss = log_z**2
+ return torch.sum(z_loss) / (num_groups * tokens_per_group)
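
As a quick numeric check of the formula above: with all-zero logits over `E` experts, `logsumexp` is `log(E)` for every token, so the loss reduces to `log(E) ** 2`. A sketch, assuming `router_z_loss_func` from this file is in scope:

```python
import math
import torch

logits = torch.zeros(2, 4, 8)  # [num_groups, tokens_per_group, num_experts]
loss = router_z_loss_func(logits)
assert torch.allclose(loss, torch.tensor(math.log(8) ** 2))
```
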
+
+
+def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float:
+ r"""
+ Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+ See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+ function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+ experts is too unbalanced.
+
+ Args:
+ router_probs (`torch.Tensor`):
+ Probability assigned to each expert per token. Shape: [batch_size, sequence_length, num_experts].
+ expert_indices (`torch.Tensor`):
+ Indices tensor of shape [batch_size, sequence_length] identifying the selected expert for a given token.
+
+ Returns:
+ The auxiliary loss.
+ """
+ num_experts = router_probs.shape[-1]
+
+ # cast the expert indices to int64, otherwise one-hot encoding will fail
+ if expert_indices.dtype != torch.int64:
+ expert_indices = expert_indices.to(torch.int64)
+
+ if len(expert_indices.shape) == 2:
+ expert_indices = expert_indices.unsqueeze(2)
+
+ expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts)
+
+ # For a given token, determine if it was routed to a given expert.
+ expert_mask = torch.max(expert_mask, axis=-2).values
+
+ # cast to float32 otherwise mean will fail
+ expert_mask = expert_mask.to(torch.float32)
+ tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2)
+
+ router_prob_per_group_and_expert = torch.mean(router_probs, axis=-2)
+ return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) * (num_experts**2)
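
This loss measures the correlation between routing fractions and router probabilities: balanced routing with uniform probabilities evaluates to `1.0`, while confidently collapsing every token onto one expert evaluates to `num_experts`. A sketch, assuming `load_balancing_loss_func` above is in scope:

```python
import torch

num_experts, seq_len = 4, 8

# Balanced: uniform probabilities, round-robin assignment -> loss == 1.0
probs = torch.full((1, seq_len, num_experts), 1.0 / num_experts)
indices = (torch.arange(seq_len) % num_experts).unsqueeze(0)
assert torch.allclose(load_balancing_loss_func(probs, indices), torch.tensor(1.0))

# Collapsed: every token confidently routed to expert 0 -> loss == num_experts
one_hot_probs = torch.zeros(1, seq_len, num_experts)
one_hot_probs[..., 0] = 1.0
zeros = torch.zeros(1, seq_len, dtype=torch.int64)
assert torch.allclose(load_balancing_loss_func(one_hot_probs, zeros), torch.tensor(4.0))
```
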
+
+
+class GPTSanJapaneseDenseActDense(nn.Module):
+ """
+ FFN Layer for Switch Transformer and Extra layers
+
+ GPTSAN can mix Switch Transformer layers and normal Transformer layers. This class is used as the Expert in
+ Switch Transformer layers and as the FFN in regular Transformer layers. ReLU is used in the Switch Transformer
+ layers and Swish in the normal Transformer layers, and the `ext_layer` argument selects which one is used.
+
+ """
+
+ def __init__(self, config: GPTSanJapaneseConfig, ext_layer=False):
+ super().__init__()
+ d_inter = config.d_ext if ext_layer else config.d_ff
+ self.wi = nn.Linear(config.d_model, d_inter, bias=ext_layer)
+ self.wo = nn.Linear(d_inter, config.d_model, bias=ext_layer)
+ self.dropout = nn.Identity() if ext_layer else nn.Dropout(config.dropout_rate)
+ self.act = ACT2FN["swish" if ext_layer else "relu"]
+
+ def forward(self, hidden_states):
+ r"""
+ Args:
+ hidden_states (`torch.Tensor`) :
+ [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+ Returns:
+ torch.Tensor[num_groups, tokens_per_group, hidden_dim]
+
+ """
+ hidden_states = self.wi(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.wo(hidden_states)
+ return hidden_states
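
A miniature smoke test of the two FFN variants (a sketch; the tiny config values are arbitrary and the classes from this file are assumed to be in scope):

```python
import torch

tiny = GPTSanJapaneseConfig(d_model=8, d_ff=16, d_ext=16)
expert_ffn = GPTSanJapaneseDenseActDense(tiny, ext_layer=False)  # ReLU, no bias
extra_ffn = GPTSanJapaneseDenseActDense(tiny, ext_layer=True)    # Swish, with bias
x = torch.randn(2, 5, 8)
assert expert_ffn(x).shape == x.shape and extra_ffn(x).shape == x.shape
```
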
+
+
+class GPTSanJapaneseTop1Router(nn.Module):
+ """
+ Router using tokens choose top-1 experts assignment.
+
+ This router uses the same mechanism as in Switch Transformer (https://huggingface.co/papers/2101.03961) and V-MoE
+ (https://huggingface.co/papers/2106.05974): tokens choose their top experts. Items are sorted by router_probs and then
+ routed to their choice of expert until the expert's expert_capacity is reached. **There is no guarantee that each
+ token is processed by an expert**, or that each expert receives at least one token.
+
+ """
+
+ def __init__(self, config: GPTSanJapaneseConfig):
+ super().__init__()
+ self.num_experts = config.num_experts
+ self.expert_capacity = config.expert_capacity
+ self.classifier = nn.Linear(config.hidden_size, self.num_experts, bias=config.router_bias)
+ self.jitter_noise = config.router_jitter_noise
+ self.ignore_padding_tokens = config.router_ignore_padding_tokens
+ self.dtype = getattr(torch, config.router_dtype)
+
+ def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+ r"""
+ Computes router probabilities from input hidden states.
+
+ Args:
+ hidden_states (`torch.Tensor`):
+ (batch_size, sequence_length, hidden_dim) from which router probabilities are computed.
+ Returns:
+ router_probabilities (`torch.Tensor`):
+ Tensor of shape (batch_size, sequence_length, num_experts) corresponding to the probabilities for each
+ token and expert. Used for routing tokens to experts.
+ router_logits (`torch.Tensor`):
+ Logits tensor of shape (batch_size, sequence_length, num_experts) corresponding to raw router logits.
+ This is used later for computing router z-loss.
+ """
+ # float32 is used to ensure stability. See the discussion of "selective precision" in
+ # https://huggingface.co/papers/2101.03961.
+ # We also store the previous dtype to cast back the output to the previous dtype
+ self.input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(self.dtype)
+
+ if self.training and self.jitter_noise > 0:
+ # Multiply the token inputs by the uniform distribution - adding some noise
+ hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
+
+ # Shape: [num_groups, tokens_per_group, num_experts]
+ self._cast_classifier()
+ router_logits = self.classifier(hidden_states)
+
+ # Apply Softmax and cast back to the original `dtype`
+ router_probabilities = nn.functional.softmax(router_logits, dim=-1, dtype=self.dtype).to(self.input_dtype)
+ return router_probabilities, router_logits
+
+ def _cast_classifier(self):
+ r"""
+ `bitsandbytes` `Linear8bitLt` layers do not support manual casting. Therefore we need to check whether the
+ classifier is an instance of the `Linear8bitLt` class by checking for its special attributes.
+ """
+ if not (hasattr(self.classifier, "SCB") or hasattr(self.classifier, "CB")):
+ self.classifier = self.classifier.to(self.dtype)
+
+ def forward(self, hidden_states: torch.Tensor) -> tuple:
+ r"""
+ Generic forward function for every Router class. Each Router expects to have the same input hidden states
+ (`hidden_states`) corresponding to the hidden states for each token, and the `expert_capacity` corresponding
+ to the number of tokens the Router will send to each expert; some Routers may send only a few tokens to each expert.
+
+ Each Router works as the following: it expects the hidden states for each token, gets the `router_probs` and
+ `router_logits` from the `router_weights`. This will assign for each token, the raw probability to be assigned
+ to an expert. Then each Router class will have to define its own `_compute_routing_instructions`.
+
+ Args:
+ hidden_states (`torch.Tensor`) :
+ [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+ Returns:
+ tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`] Tuple containing the expert index, the router probs
+ and the router logits. The router probabilities and logits are required to compute the loss.
+ """
+ router_probs, router_logits = self._compute_router_probabilities(hidden_states)
+
+ expert_index = torch.argmax(router_probs, dim=-1)
+ expert_index = torch.nn.functional.one_hot(expert_index, num_classes=self.num_experts)
+
+ # Mask tokens outside expert capacity. Sum over each sequence
+ token_priority = torch.cumsum(expert_index, dim=-2)
+ # mask tokens whose routing to the expert would overflow its capacity
+ expert_capacity_mask = token_priority <= self.expert_capacity
+ expert_index = expert_index * expert_capacity_mask
+
+ router_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1)
+ return expert_index, router_probs, router_logits
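
The capacity masking in `forward` is just a per-sequence running count: once an expert has been chosen `expert_capacity` times, later tokens that picked it are dropped from the dispatch mask. A standalone numeric sketch of those three lines:

```python
import torch

expert_index = torch.nn.functional.one_hot(torch.tensor([[0, 0, 0, 1]]), num_classes=2)
token_priority = torch.cumsum(expert_index, dim=-2)  # running count per expert
expert_index = expert_index * (token_priority <= 2)  # expert_capacity = 2
# The third token also chose expert 0, but its capacity of 2 was already reached:
assert expert_index[0].tolist() == [[1, 0], [1, 0], [0, 0], [0, 1]]
```
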
+
+
+class GPTSanJapaneseSparseMLP(nn.Module):
+ r"""
+ Implementation of the Switch Transformers Sparse MLP module.
+ """
+
+ def __init__(self, config: GPTSanJapaneseConfig, expert_class: nn.Module = GPTSanJapaneseDenseActDense):
+ super().__init__()
+ # Step 1: Get the correct router according to its class
+ self.router = GPTSanJapaneseTop1Router(config)
+
+ # Step 2: Get the experts
+ self.experts = nn.ModuleDict()
+ for idx in range(config.num_experts):
+ self.experts[f"expert_{idx}"] = expert_class(config)
+
+ def forward(self, hidden_states):
+ r"""
+ Hold on, this will be slightly tricky to understand. In the correct order, a MoE layer does the following:
+
+ 1- Gets the `router_mask` from the router. The shape of the mask is `(batch_size, sequence_length, num_expert)`
+ and corresponds to the argmax of the `router_probs`. The probabilities are needed in the computation of the
+ hidden states: they are broadcast to the hidden-state values (they can be interpreted as a scaling factor).
+
+ 2- Dispatch the tokens to their associated experts. We do a classic for loop over the experts and assign the
+ corresponding hidden states to each expert.
+
+ """
+ # Step 1: Get the router_mask from the router as well as the probabilities
+ router_mask, router_probs, router_logits = self.router(hidden_states)
+ expert_index = torch.argmax(router_mask, dim=-1)
+
+ # The router might not map every token to an expert, which means that some hidden states can pass from one
+ # layer to the next unchanged. That is why the hidden states are cloned before updating only the selected ones.
+
+ next_states = hidden_states.clone()
+ for idx, expert in enumerate(self.experts.values()):
+ token_indices = router_mask[:, :, idx].bool()
+ next_states[token_indices] = expert(hidden_states[token_indices]).to(next_states.dtype)
+
+ hidden_states = router_probs * next_states
+ return hidden_states, (router_logits, expert_index)
+
+
+class GPTSanJapaneseLayerSparseFF(nn.Module):
+ r"""
+ Switch Transformers Feed Forward layer module. This is a wrapper around the Mixture of Experts module.
+
+ Parameters:
+ config : ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+ def __init__(self, config: GPTSanJapaneseConfig):
+ super().__init__()
+ self.mlp = GPTSanJapaneseSparseMLP(config)
+ self.soft_bypass_mlp = nn.Linear(config.d_model, config.d_model, bias=False)
+ self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+
+ def forward(self, hidden_states, output_router_logits):
+ r"""
+ Args:
+ hidden_states (`torch.Tensor`) :
+ [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+ output_router_logits (`bool`) :
+ Whether to also return the router tuple (router logits and expert index).
+ Returns:
+ torch.Tensor[num_groups, tokens_per_group, hidden_dim]
+
+ """
+ forwarded_states, router_tuple = self.mlp(hidden_states)
+ forwarded_states += torch.tanh(self.soft_bypass_mlp(hidden_states))
+ output = hidden_states + self.norm(forwarded_states)
+
+ if output_router_logits and router_tuple is not None:
+ return output, router_tuple
+ else:
+ return output
+
+
+class GPTSanJapaneseLayerDenseFF(nn.Module):
+ r"""
+ Extra Transformers Feed Forward layer module.
+
+ Parameters:
+ config : ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ """
+
+ def __init__(self, config: GPTSanJapaneseConfig):
+ super().__init__()
+ # Check if it is a sparse layer, if not then it is a dense layer
+ self.mlp = GPTSanJapaneseDenseActDense(config, ext_layer=True)
+ self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+
+ def forward(self, hidden_states):
+ r"""
+ Args:
+ hidden_states (`torch.Tensor`) :
+ [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+ Returns:
+ torch.Tensor[num_groups, tokens_per_group, hidden_dim]
+
+ """
+ forwarded_states = self.mlp(hidden_states)
+ output = hidden_states + self.norm(forwarded_states)
+ return output
+
+
+class GPTSanJapaneseAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ is_decoder: bool = False,
+ bias: bool = True,
+ is_causal: bool = False,
+ config: Optional[GPTSanJapaneseConfig] = None,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ self.config = config
+
+ if (self.head_dim * num_heads) != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+ f" and `num_heads`: {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+ self.is_decoder = is_decoder
+ self.is_causal = is_causal
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states) * self.scaling
+ # get key, value proj
+ # `past_key_values[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_values` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_values is not None
+ and past_key_values[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_values[0]
+ value_states = past_key_values[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_values is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_values[0], key_states], dim=2)
+ value_states = torch.cat([past_key_values[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_values` is always `None`
+ past_key_values = (key_states, value_states)
+
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+ key_states = key_states.reshape(*proj_shape)
+ value_states = value_states.reshape(*proj_shape)
+
+ src_len = key_states.size(1)
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if layer_head_mask is not None:
+ if layer_head_mask.size() != (self.num_heads,):
+ raise ValueError(
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+ # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped, past_key_values
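
Shape-wise, the forward pass above folds the head dimension into the batch dimension so that both matrix products can use `torch.bmm`. A minimal sketch of the two batched products:

```python
import torch

bsz, num_heads, tgt_len, head_dim = 2, 4, 5, 8
q = torch.randn(bsz * num_heads, tgt_len, head_dim)
k = torch.randn(bsz * num_heads, tgt_len, head_dim)
v = torch.randn(bsz * num_heads, tgt_len, head_dim)
weights = torch.softmax(torch.bmm(q, k.transpose(1, 2)), dim=-1)
out = torch.bmm(weights, v)  # back to (bsz * num_heads, tgt_len, head_dim)
assert out.shape == (bsz * num_heads, tgt_len, head_dim)
```
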
+
+
+class GPTSanJapaneseLayerSelfAttention(nn.Module):
+ """
+ Self Attention and Normalization Unit
+ """
+
+ def __init__(self, config, has_relative_attention_bias=False):
+ super().__init__()
+ self.self_attn = GPTSanJapaneseAttention(
+ embed_dim=config.d_model,
+ num_heads=config.num_heads,
+ is_decoder=True,
+ bias=has_relative_attention_bias,
+ )
+ self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: Optional[tuple[torch.FloatTensor]],
+ past_key_values: Optional[Cache] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = False,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+ r"""
+ Self-attention and normalize block.
+
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ if the model is configured as a decoder.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+ decoding. If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
+ `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ Returns:
+ tuple[torch.Tensor[num_groups, tokens_per_group, hidden_dim],...]
+ """
+ # Self Attention
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+ # add present self-attn cache to positions 1,2 of present_key_value tuple
+ atten_out = self.self_attn(
+ hidden_states=hidden_states,
+ past_key_values=self_attn_past_key_value,
+ attention_mask=(1 - attention_mask) * torch.finfo(hidden_states.dtype).min,
+ layer_head_mask=head_mask,
+ output_attentions=output_attentions,
+ )
+ if output_attentions:
+ attn_weights = (atten_out[1],)
+ else:
+ attn_weights = ()
+
+ attention_output = atten_out[0]
+
+ hidden = hidden_states + self.norm(attention_output)
+
+ if use_cache:
+ outputs = (hidden, atten_out[2]) # hidden, present, (attentions)
+ else:
+ outputs = (hidden,) # hidden, (attentions)
+
+ return outputs + attn_weights
+
+
+class GPTSanJapaneseBlock(nn.Module):
+ """
+ Self Attention and FFN Unit
+ """
+
+ def __init__(self, config, ext_layer=False):
+ super().__init__()
+ self.self_attn = GPTSanJapaneseLayerSelfAttention(config)
+ self.feed_forward = GPTSanJapaneseLayerDenseFF(config) if ext_layer else GPTSanJapaneseLayerSparseFF(config)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: Optional[tuple[torch.FloatTensor]],
+ past_key_values: Optional[Cache] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = False,
+ output_attentions: Optional[bool] = False,
+ output_router_tuple: Optional[bool] = False,
+ ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+ r"""
+ GPTSAN transformer block.
+
+ Args:
+ hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ if the model is configured as a decoder.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+ decoding. If `past_key_values` are used, the user can optionally input only the last
+ `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
+ `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+ in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ output_attentions (`bool`) :
+ Whether to output the attention probabilities.
+ output_router_tuple (`bool`) :
+ Whether to output the experts' router logits and expert indices.
+ Returns:
+ tuple[torch.Tensor[num_groups, tokens_per_group, hidden_dim],...]
+ """
+ atten_out = self.self_attn(
+ hidden_states=hidden_states,
+ past_key_values=past_key_values,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+ attention_output = atten_out[0]
+
+ if isinstance(self.feed_forward, GPTSanJapaneseLayerSparseFF):
+ sparse_out = self.feed_forward(attention_output, output_router_tuple)
+ if output_router_tuple:
+ hidden, router_tuple = sparse_out
+ else:
+ hidden = sparse_out
+ else:
+ hidden = self.feed_forward(attention_output)
+
+ outputs = (hidden,) + atten_out[1:]
+
+ if isinstance(self.feed_forward, GPTSanJapaneseLayerSparseFF) and output_router_tuple:
+ outputs += (router_tuple,)
+
+ return outputs
+
+
+class GPTSanJapanesePreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: GPTSanJapaneseConfig
+ base_model_prefix = "gptsan_japanese"
+ supports_gradient_checkpointing = False
+ _no_split_modules = ["GPTSanJapaneseBlock"]
+ _skip_keys_device_placement = "past_key_values"
+
+ @property
+ def dummy_inputs(self):
+ input_ids = torch.tensor(DUMMY_INPUTS)
+ input_mask = torch.tensor(DUMMY_MASK)
+ dummy_inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ }
+ return dummy_inputs
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ factor = self.config.initializer_factor # Used for testing weights initialization
+ if isinstance(module, nn.LayerNorm):
+ module.weight.data.fill_(factor * 1.0)
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
+ if hasattr(module, "bias") and module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=factor * 1.0)
+ elif isinstance(module, GPTSanJapaneseModel):
+ # Mesh TensorFlow embeddings initialization
+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+ module.embed_tokens.weight.data.normal_(mean=0.0, std=factor * 1.0)
+ module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 1.0)
+ if hasattr(module, "extra_position_embeddings") and module.extra_position_embeddings is not None:
+ module.extra_position_embeddings.weight.data.normal_(mean=0.0, std=factor * 1.0)
+ elif isinstance(module, (GPTSanJapaneseModel, GPTSanJapaneseForConditionalGeneration)):
+ # Mesh TensorFlow embeddings initialization
+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+ module.final_logits_bias.data.normal_(mean=0.0, std=factor * 1.0)
+ if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
+ module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
+ elif isinstance(module, GPTSanJapaneseDenseActDense):
+ # Mesh TensorFlow FF initialization
+ # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+ # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+ module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
+ if hasattr(module.wi, "bias") and module.wi.bias is not None:
+ module.wi.bias.data.zero_()
+ module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
+ if hasattr(module.wo, "bias") and module.wo.bias is not None:
+ module.wo.bias.data.zero_()
+ elif isinstance(module, GPTSanJapaneseAttention):
+ # Multi-headed attention
+ d_model = self.config.d_model
+ key_value_proj_dim = self.config.d_model
+ n_heads = self.config.num_heads
+ module.k_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
+ module.v_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
+ module.q_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
+ module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
+ elif isinstance(module, GPTSanJapaneseSparseMLP):
+ # Mesh TensorFlow attention initialization to avoid scaling before softmax
+ # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+ d_model = self.config.d_model
+ key_value_proj_dim = self.config.d_model
+ n_heads = self.config.num_heads
+ module.router.classifier.weight.data.normal_(mean=0.0, std=factor * 1)
+ for idx in range(self.config.num_experts):
+ module.experts[f"expert_{idx}"].wi.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
+ module.experts[f"expert_{idx}"].wo.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
+
+ def _shift_right(self, input_ids):
+ decoder_start_token_id = self.config.decoder_start_token_id
+ pad_token_id = self.config.pad_token_id
+
+ if decoder_start_token_id is None:
+ raise ValueError(
+ "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. "
+ "See T5 docs for more information."
+ )
+
+ # shift inputs to the right
+ if is_torch_fx_proxy(input_ids):
+ # Item assignment is not supported natively for proxies.
+ shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
+ shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
+ else:
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+ shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+ shifted_input_ids[..., 0] = decoder_start_token_id
+
+ if pad_token_id is None:
+ raise ValueError("self.model.config.pad_token_id has to be defined.")
+ # replace possible -100 values in labels by `pad_token_id`
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+ return shifted_input_ids
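
A numeric sketch of `_shift_right`: labels move one slot to the right, the first slot takes `decoder_start_token_id`, and `-100` ignore-labels are replaced by `pad_token_id`. The pad id below is the config default; the start id is hypothetical:

```python
import torch

labels = torch.tensor([[5, -100, 6]])
shifted = labels.new_zeros(labels.shape)
shifted[..., 1:] = labels[..., :-1].clone()
shifted[..., 0] = 35998                       # hypothetical decoder_start_token_id
shifted.masked_fill_(shifted == -100, 35995)  # pad_token_id default from the config
assert shifted.tolist() == [[35998, 5, 35995]]
```
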
+
+
+GPTSAN_JAPANESE_START_DOCSTRING = r"""
+
+ The [GPTSAN-japanese](https://github.com/tanreinama/GPTSAN) model was proposed in General-purpose Switch
+ transformer based Japanese language model.
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+GPTSAN_JAPANESE_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. GPTSAN-japanese is a model that generates sentence
+ continuations or predicts tokens at mask positions. Special tokens required for inputs to the model are
+ automatically appended.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ An input that masks the Prefix part in the Prefix-LM input. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **prefix** input,
+ - 0 for tokens that are **not-prefix** input.
+ spout (`torch.Tensor` of shape `(batch_size, config.d_spout)`):
+ This vector is transformed through an 8-layer FFN and can be used instead of `past_key_values`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+ representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
+ input (see `past_key_values`). This is useful if you want more control over how to convert
+ `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+ router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
+ Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
+"""
+
+
+@add_start_docstrings(
+ "The bare GPTSAN-japanese Model transformer outputting raw hidden-states without any specific head on top.",
+ GPTSAN_JAPANESE_START_DOCSTRING,
+)
+class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
+ def __init__(self, config: GPTSanJapaneseConfig):
+ super().__init__(config)
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model)
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
+ self.last_project = nn.Linear(config.d_model, config.d_model, bias=True)
+ self.act = ACT2FN["swish"]
+
+ self.blocks = torch.nn.ModuleList([])
+ for _ in range(config.num_switch_layers):
+ self.blocks.append(GPTSanJapaneseBlock(config))
+ for _ in range(config.num_ext_layers):
+ self.blocks.append(GPTSanJapaneseBlock(config, ext_layer=True))
+
+ if config.num_ext_layers > 0:
+ self.extra_position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model)
+
+ if config.d_spout:
+ spouts = []
+ for _ in range(8):
+ spouts.append(nn.Linear(config.d_spout, config.d_spout, bias=False))
+ spouts.append(nn.Tanh())
+ spouts.append(nn.Linear(config.d_spout, config.num_layers * 2 * config.d_model, bias=False))
+ self.spout = nn.Sequential(*spouts)
+
+ self.post_init()
+
+ def set_input_embeddings(self, new_embeddings):
+ self.embed_tokens = new_embeddings
+
+ @add_start_docstrings_to_model_forward(GPTSAN_JAPANESE_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.FloatTensor] = None,
+ spout: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = False,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ num_precontext: Optional[torch.LongTensor] = None,
+ ) -> Union[MoEModelOutputWithPastAndCrossAttentions, tuple[torch.FloatTensor]]:
+ r"""
+ num_precontext (`torch.LongTensor` of shape `(batch_size,1)`):
+ Length of the `hybrid` input tokens in the input. Tokens up to this length attend to both the preceding and
+ following context like BERT; tokens after that attend only to the preceding context like GPT. See also:
+ https://github.com/tanreinama/GPTSAN/blob/main/report/model.md
+
+ Returns:
+ `MoEModelOutputWithPastAndCrossAttentions` if `return_dict=True`, otherwise a plain `tuple`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ device = self.position_embeddings.weight.device
+ if input_ids is None:
+ input_ids = torch.zeros([1, 1]).int().to(device)  # dummy input used when input_ids is None
+ if inputs_embeds is not None:
+ raise NotImplementedError(
+ "GPTSanJapaneseModel does not use `inputs_embeds`. Make sure to pass in `input_ids` instead."
+ )
+ num_pasts_contexts = 0
+ num_batch = input_ids.shape[0]
+ pasts_or_spout_value = None
+ if past_key_values is not None:
+ num_pasts_contexts = past_key_values.get_seq_length()
+ elif self.config.d_spout and spout is not None:
+ # `spout` is a special input vector specific to GPTSAN
+ # This controls the output by projecting embedded information such as the class of sentences during learning.
+ It should be passed instead of the first past_key_values.
+ # See the original GPTSAN repository for details
+ num_pasts_contexts += 1
+
+ # If there is an attention_mask, increase first one for spout
+ if self.config.d_spout and spout is not None and attention_mask is not None:
+ attention_mask_with_spout = torch.ones(num_batch, attention_mask.shape[1] + 1, device=device)
+ attention_mask_with_spout[:, 1:] -= 1 - attention_mask # 1st token should be spout
+ attention_mask = attention_mask_with_spout # update attention_mask
+
+ if num_precontext is not None:
+ # `num_precontext` is the number of tokens that refer to each other in prefix-lm
+ # created per batch, so dimension of num_precontext should be [batch, 1]
+ if not (
+ len(num_precontext.shape) == 2 and num_precontext.shape[1] == 1
+ ): # num_precontext Should be [batch,1]
+ raise ValueError("num_precontext should be [batch, 1] size.")
+ num_precontext = torch.reshape(num_precontext, [-1])
+ else:
+ num_precontext = torch.zeros([num_batch]).int().to(device)
+
+ num_input_contexts = input_ids.shape[1]
+ num_output_contexts = num_input_contexts + num_pasts_contexts
+
+ hidden_states = self.embed_tokens(input_ids)
+
+ if past_key_values is not None:
+ pasts_or_spout_value = past_key_values
+ elif self.config.d_spout and spout is not None:
+ # Make vector from `spout` of GPTSAN to the same shape as past_key_values
+ pasts_or_spout_value = self.spout(spout) # projecting `spout` vector
+ pasts_or_spout_value = torch.reshape(
+ pasts_or_spout_value,
+ [
+ num_batch,
+ self.config.num_layers,
+ 2,
+ self.config.num_heads,
+ num_pasts_contexts,
+ self.config.d_model // self.config.num_heads,
+ ],
+ )
+ pasts_or_spout_value = torch.split(pasts_or_spout_value, [1] * self.config.num_layers, dim=1)
+ # make same shape as past_key_values
+ pasts_or_spout_value = tuple(
+ tuple(b.squeeze(1) for b in torch.split(a.squeeze(1), [1, 1], dim=1)) for a in pasts_or_spout_value
+ )
+ else:
+ pasts_or_spout_value = [None] * self.config.num_layers
+
+ # Token position considering spout and pasts
+ token_position = torch.arange(num_input_contexts).to(device) + num_pasts_contexts
+
+ if attention_mask is None:
+ attention_mask = torch.ones(num_batch, num_input_contexts, device=device)
+
+ # positions for get position_embeddings
+ gather_position = (
+ (
+ torch.zeros((num_batch, self.config.d_model, num_input_contexts)).to(device)
+ + token_position.unsqueeze(0)
+ )
+ .transpose(1, 2)
+ .long()
+ )
+ # When padding with padding_side="left", zeros line up on the left side of attention_mask, so position_embeddings is shifted accordingly
+ gather_position -= (1 - attention_mask).argmin(dim=-1).unsqueeze(1).unsqueeze(2)
+ gather_position = torch.clip(gather_position, num_pasts_contexts, self.config.max_position_embeddings - 1)
+
+ # attention_mask is applied per batch
+ for i in range(num_batch):
+ hidden_states[i] += torch.gather(self.position_embeddings.weight, dim=0, index=gather_position[i])
+
+ # Create a mask to be used when making the prefix Input length of Prefix-LM variable
+ causal_mask = (
+ torch.tril(torch.ones((num_output_contexts, num_output_contexts), dtype=torch.uint8))
+ .view(1, 1, num_output_contexts, num_output_contexts)
+ .to(device)
+ )
+ prefix_lm_mask = causal_mask[:, :, -num_input_contexts:, :]
+ if token_type_ids is not None:
+ token_type_ids = token_type_ids.unsqueeze(1).unsqueeze(2)
+ prefix_lm_mask = ((prefix_lm_mask + token_type_ids) > 0).float()
+ # Merge prefix_lm_mask and attention_mask
+ extended_attention_mask = prefix_lm_mask * attention_mask.unsqueeze(1).unsqueeze(2)
+
+ # Prepare head mask if needed
+ if head_mask is not None:
+ head_mask = self.get_head_mask(
+ head_mask, self.config.num_switch_layers + self.config.num_ext_layers
+ ) # n_layer x batch x n_heads x N x N
+
+ # outputs
+ present_key_value_states = () if self.config.use_cache or use_cache else None
+ all_hidden_states = () if self.config.output_hidden_states or output_hidden_states else None
+ all_attentions = () if self.config.output_attentions or output_attentions else None
+ all_router_probs = () if self.config.output_router_logits or output_router_logits else None
+
+ for layer, past in enumerate(pasts_or_spout_value):
+ if layer == self.config.num_switch_layers:
+ if self.config.num_ext_layers > 0:
+ # extra_position_embeddings are extra position embeddings that are only created when extending the model with code from the original GPTSAN repository. Not used in the default model.
+ # However, it is created when you create an additional layer and partially train only that location.
+ # Therefore, convert_gptsan_tf_checkpoint_to_pytorch.py is used when converting and loading models created in the original GPTSAN repository.
+ for i in range(num_batch):
+ hidden_states[i] += torch.gather(
+ self.extra_position_embeddings.weight, dim=0, index=gather_position[i]
+ )
+
+ output_router_tuple = (
+ self.config.output_router_logits or output_router_logits
+ ) and layer < self.config.num_switch_layers
+ block_output = self.blocks[layer](
+ hidden_states=hidden_states,
+ past_key_values=past,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ use_cache=self.config.use_cache or use_cache,
+ output_attentions=self.config.output_attentions or output_attentions,
+ output_router_tuple=output_router_tuple,
+ )
+
+ outpos = 0
+ hidden_states = block_output[outpos]
+ if self.config.output_hidden_states or output_hidden_states:
+ all_hidden_states += (hidden_states,)
+ if self.config.use_cache or use_cache:
+ outpos += 1
+ present = block_output[outpos]
+ present_key_value_states += (present,)
+ if self.config.output_attentions or output_attentions:
+ outpos += 1
+ attention_probs = block_output[outpos]
+ all_attentions += (attention_probs,)
+ if output_router_tuple:
+ outpos += 1
+ router_tuple = block_output[outpos]
+ all_router_probs += (router_tuple[0],)  # all_router_probs is a tuple, which has no .append
+
+ hidden_states = self.last_project(hidden_states)
+ hidden_states = self.act(hidden_states)
+
+ if self.config.output_hidden_states or output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ present_key_value_states,
+ all_hidden_states,
+ all_attentions,
+ all_router_probs,
+ ]
+ if v is not None
+ )
+
+ return MoEModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=present_key_value_states,
+ hidden_states=all_hidden_states,
+ attentions=all_attentions,
+ router_probs=all_router_probs,
+ )
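
An end-to-end smoke test of the bare model at miniature scale (a sketch; the config values are arbitrary, do not match the pretrained checkpoint, and the classes from this file are assumed to be in scope):

```python
import torch

tiny = GPTSanJapaneseConfig(
    vocab_size=100, d_model=16, d_ff=32, d_spout=8,
    num_switch_layers=1, num_ext_layers=0, num_heads=2, num_experts=2,
)
model = GPTSanJapaneseModel(tiny).eval()
with torch.no_grad():
    out = model(input_ids=torch.randint(0, 100, (1, 6)))
assert out.last_hidden_state.shape == (1, 6, 16)
```
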
+
+
+@add_start_docstrings(
+ "The bare GPTSAN-japanese Model with a language modeling head.",
+ GPTSAN_JAPANESE_START_DOCSTRING,
+)
+class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config: GPTSanJapaneseConfig):
+ super().__init__(config)
+ self.model = GPTSanJapaneseModel(config)
+ self.register_buffer("final_logits_bias", torch.zeros([1, config.vocab_size]))
+ self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+ if not self.config.torchscript:
+ self.lm_head.weight = self.model.embed_tokens.weight
+
+ @add_start_docstrings_to_model_forward(GPTSAN_JAPANESE_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.FloatTensor] = None,
+ spout: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = False,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ output_router_logits: Optional[bool] = None,
+ labels: Optional[torch.LongTensor] = None,
+ ) -> Union[tuple[torch.FloatTensor], MoECausalLMOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size - 1]`. All labels set to `-100` are ignored (masked); the loss is only computed for
+ labels in `[0, ..., config.vocab_size - 1]`.
+
+ Returns:
+ `MoECausalLMOutputWithPast` if `return_dict=True` is passed or `config.use_return_dict` is set, otherwise a `tuple`.
+
+ Example:
+
+ Text generation with the regular LM model
+ ```python
+ >>> from transformers import AutoModel, AutoTokenizer, trainer_utils
+
+ >>> device = "cuda"
+ >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
+ >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+ >>> x_token = tokenizer("織田信長は、", return_tensors="pt")
+ >>> trainer_utils.set_seed(30)
+ >>> input_ids = x_token.input_ids.to(device)
+ >>> gen_token = model.generate(input_ids, max_new_tokens=50)
+ >>> tokenizer.decode(gen_token[0])
+ "織田信長は、政治・軍事の中枢まで掌握した政治家であり、日本史上類を見ない驚異的な軍事侵攻を続け..."
+ ```
+
+ Text generation with the Prefix-LM model
+ ```python
+ >>> from transformers import AutoModel, AutoTokenizer, trainer_utils
+
+ >>> device = "cuda"
+ >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
+ >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+ >>> x_token = tokenizer("", prefix_text="織田信長は、", return_tensors="pt")
+ >>> trainer_utils.set_seed(30)
+ >>> input_ids = x_token.input_ids.to(device)
+ >>> token_type_ids = x_token.token_type_ids.to(device)
+ >>> gen_token = model.generate(input_ids, token_type_ids=token_type_ids, max_new_tokens=50)
+ >>> tokenizer.decode(gen_token[0])
+ "織田信長は、政治・外交で数々の戦果を上げるが、1568年からは、いわゆる本能寺の変で細川晴元に暗殺される..."
+ ```
+
+ Simultaneous text generation and masked language modeling
+ ```python
+ >>> from transformers import AutoModel, AutoTokenizer, trainer_utils
+
+ >>> device = "cuda"
+ >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
+ >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+ >>> masked_sentence = "武田信玄は、<|inputmask|>時代ファンならぜひ押さえ<|inputmask|>きたい名将の一人。"
+ >>> x_token = tokenizer("", prefix_text=masked_sentence, return_tensors="pt")
+ >>> trainer_utils.set_seed(30)
+ >>> input_ids = x_token.input_ids.to(device)
+ >>> token_type_ids = x_token.token_type_ids.to(device)
+ >>> out_lm_token = model.generate(input_ids, token_type_ids=token_type_ids, max_new_tokens=50)
+ >>> out_mlm_token = model(input_ids, token_type_ids=token_type_ids).logits.argmax(axis=-1)
+ >>> tokenizer.decode(out_mlm_token[0])
+ "武田信玄は、戦国時代ファンならぜひ押さえておきたい名将の一人。"
+
+ >>> tokenizer.decode(out_lm_token[0][input_ids.shape[1] :])
+ "武田氏の三代に渡った武田家のひとり\n甲斐市に住む、日本史上最大の戦国大名。..."
+ ```"""
+ SEG_TOKEN = self.config.separator_token_id
+ use_cache = use_cache or self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ model_return_dict = True
+ num_precontext = None
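+ # Recover the per-row prefix length from the position of the separator token;
+ # it tells the model how many leading tokens belong to the Prefix-LM context.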
+ if input_ids is not None:
+ num_batch = input_ids.shape[0]
+ num_precontext = torch.zeros([num_batch]).int().to(input_ids.device)
+ where_separators = torch.where(input_ids == SEG_TOKEN)
+ num_precontext[where_separators[0]] += where_separators[1]
+ num_precontext = num_precontext.unsqueeze(1)
+
+ outputs = self.model(
+ input_ids,
+ attention_mask,
+ token_type_ids,
+ spout,
+ past_key_values,
+ head_mask,
+ use_cache,
+ inputs_embeds,
+ decoder_inputs_embeds,
+ output_attentions,
+ output_hidden_states,
+ model_return_dict,
+ output_router_logits,
+ num_precontext,
+ )
+
+ lm_logits = self.lm_head(outputs[0])
+ if lm_logits.shape[-1] == self.final_logits_bias.shape[-1]:
+ lm_logits = lm_logits + self.final_logits_bias
+
+ loss = None
+ z_loss = None
+ router_probs = None
+ aux_loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(lm_logits.device)
+
+ loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+
+ if output_router_logits:
+ # Compute the router loss (z_loss + auxiliary loss) across the routers of the sparse (switch) layers
+ router_logits, expert_indexes = self._unpack_router_logits(outputs.router_probs)
+ z_loss = router_z_loss_func(router_logits)
+ router_probs = nn.Softmax(dim=-1)(router_logits)
+ aux_loss = load_balancing_loss_func(router_probs, expert_indexes)
+
+ loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ loss,
+ lm_logits,
+ outputs.past_key_values,
+ outputs.hidden_states,
+ outputs.router_probs,
+ z_loss,
+ aux_loss,
+ ]
+ if v is not None
+ )
+
+ return MoECausalLMOutputWithPast(
+ loss=loss,
+ logits=lm_logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ router_logits=outputs.router_probs,
+ z_loss=z_loss,
+ aux_loss=aux_loss,
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids: torch.LongTensor,
+ attention_mask: torch.FloatTensor,
+ token_type_ids: Optional[torch.FloatTensor] = None,
+ spout: Optional[Union[list, torch.FloatTensor]] = None,
+ past_key_values: Optional[Cache] = None,
+ **kwargs,
+ ):
+ if isinstance(spout, list):
+ spout = torch.tensor(spout).float()
+ if input_ids is not None:
+ spout = spout.to(input_ids.device)
+ if past_key_values is not None:
+ return {
+ "input_ids": input_ids[:, -1:] if input_ids is not None else None,
+ "attention_mask": attention_mask,
+ "token_type_ids": token_type_ids[:, -1:] if token_type_ids is not None else None,
+ "spout": spout,
+ "past_key_values": past_key_values,
+ }
+ return {
+ "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "token_type_ids": token_type_ids,
+ "spout": spout,
+ "past_key_values": None,
+ }
+
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+ return self._shift_right(labels)
+
+ def resize_token_embeddings(
+ self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
+ ) -> nn.Embedding:
+ new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+ self._resize_final_logits_bias(new_embeddings.weight.shape[0])
+ return new_embeddings
+
+ def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
+ old_num_tokens = self.final_logits_bias.shape[-1]
+ if new_num_tokens <= old_num_tokens:
+ new_bias = self.final_logits_bias[:, :new_num_tokens]
+ else:
+ extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
+ new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
+ self.register_buffer("final_logits_bias", new_bias)
+
+ def get_input_embeddings(self):
+ return self.model.get_input_embeddings()
+
+ def set_input_embeddings(self, new_embeddings):
+ self.model.set_input_embeddings(new_embeddings)
+
+ def _unpack_router_logits(self, router_outputs):
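+ # Keep only entries whose first element is at least 2-d, i.e. actual
+ # (router_logits, expert_indexes) pairs from the sparse layers; placeholder
+ # entries from non-routing layers are skipped by the shape check below.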
+ total_router_logits = []
+ total_expert_indexes = []
+ for router_output in router_outputs:
+ if len(router_output[0].shape) > 1:
+ router_logits, expert_indexes = router_output
+ total_router_logits.append(router_logits)
+ total_expert_indexes.append(expert_indexes)
+ return torch.cat(total_router_logits, dim=1), torch.cat(total_expert_indexes, dim=1)
+
+
+__all__ = ["GPTSanJapaneseForConditionalGeneration", "GPTSanJapaneseModel", "GPTSanJapanesePreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..1025fdf75fb46a0d350359a66c09315bd42effd3
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -0,0 +1,518 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for GPTSANJapanese."""
+
+import collections
+import json
+import os
+import re
+import sys
+from typing import Optional, Union
+
+import numpy as np
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....tokenization_utils_base import (
+ BatchEncoding,
+ PreTokenizedInput,
+ PreTokenizedInputPair,
+ TextInput,
+ TextInputPair,
+ TruncationStrategy,
+)
+from ....utils import PaddingStrategy, logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
+
+
+def load_vocab_and_emoji(vocab_file, emoji_file):
+ """Loads a vocabulary file and emoji file into a dictionary."""
+ with open(emoji_file, "r", encoding="utf-8") as f:
+ emoji = json.loads(f.read())
+
+ vocab = collections.OrderedDict()
+ raw_vocab = collections.OrderedDict()
+ ids_to_tokens = collections.OrderedDict()
+ with open(vocab_file, "r", encoding="utf-8") as f:
+ token = f.readlines()
+ token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
+ for idx, b in enumerate(token):
+ ids_to_tokens[idx] = b
+ raw_vocab[",".join(b)] = idx
+ for wd in b:
+ vocab[wd] = idx
+
+ return vocab, raw_vocab, ids_to_tokens, emoji
+
+
+class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
+ """
+ This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications:
+ - Decoding byte0~byte255 tokens correctly
+ - Added bagofword token handling
+ - Return token_type_ids for Prefix-LM model
+ The bagofword token represents a repetition of the previous token and is converted to 3 consecutive tokens when
+ decoding. In addition, the original Japanese special Sub-Word-Encoding has been released in this repository
+ (https://github.com/tanreinama/Japanese-BPEEncoder_V2). The token_type_ids is a mask indicating the prefix input
+ positions for the Prefix-LM model. To specify a prefix position, pass a prefix input via prefix_text, or pass the
+ prefix sentence and the part after it as a text pair in batch input.
+
+ Example:
+
+ ```python
+ >>> from transformers import GPTSanJapaneseTokenizer
+
+ >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+ >>> # You can confirm both 慶応 and 慶應 are encoded to 17750
+ >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
+ [35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
+
+ >>> # Both 慶応 and 慶應 are decoded to 慶応
+ >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
+ '吾輩は猫である🐯。実は慶応(慶応)大学出身'
+ ```
+
+ Example for Prefix-LM:
+
+ ```python
+ >>> from transformers import GPTSanJapaneseTokenizer
+
+ >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+ >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["input_ids"]
+ [35993, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 35998, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
+
+ >>> # Mask for Prefix-LM inputs
+ >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["token_type_ids"]
+ [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+ ```
+
+ Example for batch encode:
+
+ ```python
+ >>> from transformers import GPTSanJapaneseTokenizer
+
+ >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+ >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"]
+ [[35993, 35998, 8640, 25948, 35993, 35998, 30647, 35675, 35999, 35999], [35993, 35998, 10382, 9868, 35993, 35998, 30646, 9459, 30646, 35675]]
+
+ >>> # Mask for Prefix-LM inputs
+ >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"]
+ [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
+
+ >>> # Mask for padding
+ >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"]
+ [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+ ```
+
+ Args:
+ vocab_file (`str`):
+ File containing the vocabulary.
+ emoji_file (`str`):
+ File containing the emoji.
+ unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`):
+ The token used for unknown characters.
+ pad_token (`str`, *optional*, defaults to `"<|separator|>"`):
+ The token used for padding.
+ bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
+ The beginning of sequence token.
+ eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+ The end of sequence token.
+ sep_token (`str`, *optional*, defaults to `"<|segmenter|>"`):
+ A special token that separates the prefix part from the general input part.
+ do_clean_text (`bool`, *optional*, defaults to `False`):
+ Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
+ def __init__(
+ self,
+ vocab_file,
+ emoji_file,
+ unk_token="<|nottoken|>",
+ pad_token="<|separator|>",
+ bos_token="<|startoftext|>",
+ eos_token="<|endoftext|>",
+ sep_token="<|segmenter|>",
+ do_clean_text=False,
+ **kwargs,
+ ):
+ if not os.path.isfile(vocab_file):
+ raise ValueError(
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+ " model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+ )
+ if not os.path.isfile(emoji_file):
+ raise ValueError(
+ f"Can't find a emoji file at path '{emoji_file}'. To load the emoji information from a Google"
+ " pretrained model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+ )
+ self.do_clean_text = do_clean_text
+ self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
+ self.subword_tokenizer = SubWordJapaneseTokenizer(
+ vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
+ )
+
+ super().__init__(
+ unk_token=unk_token,
+ pad_token=pad_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ do_clean_text=do_clean_text,
+ **kwargs,
+ )
+
+ @property
+ def vocab_size(self):
+ # self.vocab maps every surface variant (character fluctuations unique to Japanese)
+ # to an id, so it contains more entries than there are ids; raw_vocab has exactly
+ # one entry per id and reflects the actual vocabulary size.
+ return len(self.raw_vocab)
+
+ def get_vocab(self):
+ return dict(self.raw_vocab, **self.added_tokens_encoder)
+
+ def _tokenize(self, text):
+ return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.subword_tokenizer.convert_id_to_token(index)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ words = []
+ byte_tokens = []
+ for word in tokens:
+ if word[:6] == "<|byte" and word[-2:] == "|>":
+ byte_tokens.append(int(word[6:-2]))
+ else:
+ if len(byte_tokens) > 0:
+ words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+ byte_tokens = []
+ if word[:7] == "<|emoji" and word[-2:] == "|>":
+ words.append(self.emoji["emoji_inv"][word])
+ elif word == "":
+ words.append(" ")
+ elif word == "
":
+ words.append("\n")
+ elif word == "":
+ words.append("\t")
+ elif word == "":
+ words.append("▀")
+ elif word == "":
+ words.append("ǀ")
+ elif word == "":
+ words.append("‖")
+ elif word == "<|bagoftoken|>":
+ if len(words) > 0:
+ words.append(words[-1])
+ words.append(words[-1])
+ words.append(words[-1])
+ elif word.startswith("<|") and word.endswith("|>"):
+ words.append("")
+ else:
+ words.append(word)
+ if len(byte_tokens) > 0:
+ words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+ text = "".join(words)
+ return text
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ index = 0
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ emoji_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
+ )
+ else:
+ vocab_file = (
+ (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ emoji_file = (
+ (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
+ )
+ with open(vocab_file, "w", encoding="utf-8") as writer:
+ for token_index, token in self.ids_to_tokens.items():
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+ " Please check that the vocabulary is not corrupted!"
+ )
+ index = token_index
+ writer.write(",".join(token) + "\n")
+ index += 1
+ with open(emoji_file, "w", encoding="utf-8") as writer:
+ json.dump(self.emoji, writer)
+ return vocab_file, emoji_file
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ # docstyle-ignore
+ """
+ The tokenizer returns token_type_ids as a mask separating the Prefix part from the rest.
+ token_type_ids is 1 for the Prefix part and 0 for the rest of the tokens.
+
+ Example:
+ ```python
+ >>> from transformers import GPTSanJapaneseTokenizer
+
+ >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+ >>> x_token = tokenizer("アイウエ")
+ >>> # input_ids: | SOT | SEG | ア | イ | ウ | エ |
+ >>> # token_type_ids: | 1 | 0 | 0 | 0 | 0 | 0 |
+
+ >>> x_token = tokenizer("", prefix_text="アイウエ")
+ >>> # input_ids: | SOT | ア | イ | ウ | エ | SEG |
+ >>> # token_type_ids: | 1 | 1 | 1 | 1 | 1 | 0 |
+
+ >>> x_token = tokenizer("ウエ", prefix_text="アイ")
+ >>> # input_ids: | SOT | ア | イ | SEG | ウ | エ |
+ >>> # token_type_ids: | 1 | 1 | 1 | 0 | 0 | 0 |
+ ```"""
+ prefix_len = 0
+ if self.sep_token in self.vocab:
+ segid = self.vocab[self.sep_token]
+ if segid in token_ids_0:
+ prefix_len = token_ids_0.index(segid)
+ if token_ids_1 is None:
+ total_len = len(token_ids_0)
+ else:
+ total_len = len(token_ids_0 + token_ids_1)
+ return prefix_len * [1] + (total_len - prefix_len) * [0]
+
+ def prepare_for_tokenization(self, text, prefix_text=None, add_sep_token=None, **kwargs):
+ # GPTSAN inserts extra SEP tokens in Prefix-LM in addition to SOT for text generation.
+ # SOT at the beginning of the text, and SEP at the separator between the Prefix part and the rest.
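+ # e.g. prefix_text="アイ" with text="ウエ" is prepared as "<|startoftext|>アイ<|segmenter|>ウエ"
+ # (a sketch assuming the default bos/sep tokens are present in the vocab).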
+ if add_sep_token is None:
+ add_sep_token = self.sep_token not in text # skip insertion if the un-prefix position is already marked explicitly
+ prepared = self.bos_token if self.bos_token in self.vocab else ""
+ prepared += prefix_text if prefix_text is not None else ""
+ if add_sep_token:
+ prepared += self.sep_token if self.sep_token in self.vocab else ""
+ prepared += text
+ return (prepared, kwargs)
+
+ def _batch_encode_plus(
+ self,
+ batch_text_or_text_pairs: Union[
+ list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
+ ],
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ is_split_into_words: bool = False,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[str] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ # This tokenizer converts input text pairs into Prefix input and subsequent input
+ if isinstance(batch_text_or_text_pairs[0], (tuple, list)):
+ # As a single text with an explicit un-prefix position
+ batch_prefix_texts = []
+ for pref, txt in batch_text_or_text_pairs:
+ batch_prefix_texts.append(pref + self.sep_token + txt)
+ batch_text_or_text_pairs = batch_prefix_texts
+
+ return super()._batch_encode_plus(
+ batch_text_or_text_pairs,
+ add_special_tokens,
+ padding_strategy,
+ truncation_strategy,
+ max_length,
+ stride,
+ is_split_into_words,
+ pad_to_multiple_of,
+ return_tensors,
+ return_token_type_ids,
+ return_attention_mask,
+ return_overflowing_tokens,
+ return_special_tokens_mask,
+ return_offsets_mapping,
+ return_length,
+ verbose,
+ **kwargs,
+ )
+
+
+class SubWordJapaneseTokenizer:
+ """
+ This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications:
+ - Decoding byte0~byte255 tokens correctly
+ - Added bagofword token handling
+
+ This tokenizer class comes from https://github.com/tanreinama/Japanese-BPEEncoder_V2 and is under the MIT License
+ according to the original repository.
+
+ MIT License
+
+ Copyright (c) 2020 tanreinama
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+ documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+ rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+ permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of
+ the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+ THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+ TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
+ """
+
+ def __init__(self, vocab, ids_to_tokens, emoji):
+ self.vocab = vocab # corresponds to "swe" in the original repository
+ self.ids_to_tokens = ids_to_tokens # corresponds to "bpe" in the original repository
+ self.emoji = emoji
+ self.maxlen = np.max([len(w) for w in self.vocab])
+ self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
+ self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
+ self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
+ self.content_repatter4 = re.compile(
+ r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+ )
+ self.content_repatter5 = re.compile(
+ r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+ )
+ # The original version of this regex displays catastrophic backtracking behaviour. We avoid this using
+ # possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly
+ # different regex that should generally have the same behaviour in most non-pathological cases.
+ if sys.version_info >= (3, 11):
+ self.content_repatter6 = re.compile(
+ r"(?:\d,\d{3}|[\d億])*+"
+ r"(?:\d,\d{3}|[\d万])*+"
+ r"(?:\d,\d{3}|[\d千])*+"
+ r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+ r"(?:\(税込\)|\(税抜\)|\+tax)*"
+ )
+ else:
+ self.content_repatter6 = re.compile(
+ r"(?:\d,\d{3}|[\d億万千])*"
+ r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+ r"(?:\(税込\)|\(税抜\)|\+tax)*"
+ )
+ keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
+ blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
+ self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, ""))
+
+ def __len__(self):
+ return len(self.ids_to_tokens)
+
+ def clean_text(self, content):
+ content = self.content_repatter1.sub("<URL>", content)
+ content = self.content_repatter2.sub("<EMAIL>", content)
+ content = self.content_repatter3.sub("<TEL>", content)
+ content = self.content_repatter4.sub("<DATE>", content)
+ content = self.content_repatter5.sub("<DATE>", content)
+ content = self.content_repatter6.sub("<PRICE>", content)
+ content = content.translate(self.content_trans1)
+ while "<BLOCK><BLOCK>" in content:
+ content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
+ return content
+
+ def tokenize(self, text, clean=False):
+ text = text.replace(" ", "")
+ text = text.replace(" ", "")
+ text = text.replace("\r\n", "
")
+ text = text.replace("\n", "
")
+ text = text.replace("\r", "
")
+ text = text.replace("\t", "")
+ text = text.replace("—", "ー")
+ text = text.replace("−", "ー")
+ for k, v in self.emoji["emoji"].items():
+ if k in text:
+ text = text.replace(k, v)
+ if clean:
+ text = self.clean_text(text)
+
+ def check_symbol(x):
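+ # True for a single character whose 2-byte UTF-8 encoding falls in symbol-like
+ # ranges (Latin-1 punctuation/symbols, click letters, modifier letters, and
+ # combining marks); such characters are replaced by <KIGOU> below.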
+ e = x.encode()
+ if len(x) == 1 and len(e) == 2:
+ c = (int(e[0]) << 8) + int(e[1])
+ if (
+ (c >= 0xC2A1 and c <= 0xC2BF)
+ or (c >= 0xC780 and c <= 0xC783)
+ or (c >= 0xCAB9 and c <= 0xCBBF)
+ or (c >= 0xCC80 and c <= 0xCDA2)
+ ):
+ return True
+ return False
+
+ def checku2e(x):
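+ # True for a single character whose 3-byte UTF-8 encoding lies in roughly
+ # U+2000-U+2BFF (general punctuation through miscellaneous symbols); these are
+ # replaced by <U2000U2BFF> below.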
+ e = x.encode()
+ if len(x) == 1 and len(e) == 3:
+ c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
+ if c >= 0xE28080 and c <= 0xE2B07F:
+ return True
+ return False
+
+ pos = 0
+ result = []
+ while pos < len(text):
+ end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
+ candidates = [] # (token_id, token, end_pos)
+ for e in range(end, pos, -1):
+ wd = text[pos:e]
+ if wd in self.vocab:
+ if wd[0] == "<" and len(wd) > 2:
+ candidates = [(self.vocab[wd], wd, e)]
+ break
+ else:
+ candidates.append((self.vocab[wd], wd, e))
+ if len(candidates) > 0:
+ # the smallest token_id is adopted
+ _, wd, e = min(candidates, key=lambda x: x[0])
+ result.append(wd)
+ pos = e
+ else:
+ end = pos + 1
+ wd = text[pos:end]
+ if check_symbol(wd):
+ result.append("<KIGOU>")
+ elif checku2e(wd):
+ result.append("<U2000U2BFF>")
+ else:
+ for i in wd.encode("utf-8"):
+ result.append("<|byte%d|>" % i)
+ pos = end
+ return result
+
+ def convert_id_to_token(self, index):
+ return self.ids_to_tokens[index][0]
+
+
+__all__ = ["GPTSanJapaneseTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a4b3eb1be2b4e69dcad1540abdd91f412de0ca2
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_graphormer import *
+ from .modeling_graphormer import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6cc5858a28b3a10e82a844859b1313165bbcee54
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1237a1a3ed22035b3365d31a8198457557ea8f75
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/collating_graphormer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd70a92202aabb90c0c2a2a9ac1e054134f73981
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/configuration_graphormer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..578cb08bfaf3ae4c9fc2a528900d22deda7b2e2d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/__pycache__/modeling_graphormer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..a0fafbdee53b55efb9596036817b03be0d006992
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx
@@ -0,0 +1,107 @@
+# Copyright (c) Microsoft Corporation and HuggingFace
+# Licensed under the MIT License.
+
+import cython
+
+cimport numpy
+from cython.parallel cimport parallel, prange
+
+import numpy as np
+
+
+# Reduce this number if matrices are too big for large graphs
+UNREACHABLE_NODE_DISTANCE = 510
+
+def floyd_warshall(adjacency_matrix):
+ """
+ Applies the Floyd-Warshall algorithm to the adjacency matrix, to compute the
+ shortest paths distance between all nodes, up to UNREACHABLE_NODE_DISTANCE.
+ """
+ (nrows, ncols) = adjacency_matrix.shape
+ assert nrows == ncols
+ cdef unsigned int n = nrows
+
+ adj_mat_copy = adjacency_matrix.astype(np.int32, order='C', casting='safe', copy=True)
+ assert adj_mat_copy.flags['C_CONTIGUOUS']
+ cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] M = adj_mat_copy
+ cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] path = -1 * np.ones([n, n], dtype=np.int32)
+
+ cdef unsigned int i, j, k
+ cdef numpy.int32_t M_ij, M_ik, cost_ikkj
+ cdef numpy.int32_t* M_ptr = &M[0,0]
+ cdef numpy.int32_t* M_i_ptr
+ cdef numpy.int32_t* M_k_ptr
+
+ # set unreachable nodes distance to UNREACHABLE_NODE_DISTANCE
+ for i in range(n):
+ for j in range(n):
+ if i == j:
+ M[i][j] = 0
+ elif M[i][j] == 0:
+ M[i][j] = UNREACHABLE_NODE_DISTANCE
+
+ # Floyd-Warshall main loop
+ for k in range(n):
+ M_k_ptr = M_ptr + n*k
+ for i in range(n):
+ M_i_ptr = M_ptr + n*i
+ M_ik = M_i_ptr[k]
+ for j in range(n):
+ cost_ikkj = M_ik + M_k_ptr[j]
+ M_ij = M_i_ptr[j]
+ if M_ij > cost_ikkj:
+ M_i_ptr[j] = cost_ikkj
+ path[i][j] = k
+
+ # set unreachable path to UNREACHABLE_NODE_DISTANCE
+ for i in range(n):
+ for j in range(n):
+ if M[i][j] >= UNREACHABLE_NODE_DISTANCE:
+ path[i][j] = UNREACHABLE_NODE_DISTANCE
+ M[i][j] = UNREACHABLE_NODE_DISTANCE
+
+ return M, path
+
+
+def get_all_edges(path, i, j):
+ """
+ Recursive function to compute all possible paths between two nodes from the graph adjacency matrix.
+ """
+ cdef int k = path[i][j]
+ if k == -1:
+ return []
+ else:
+ return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)
+
+
+def gen_edge_input(max_dist, path, edge_feat):
+ """
+ Generates the full edge feature and adjacency matrix.
+ Shape: num_nodes * num_nodes * max_distance_between_nodes * num_edge_features
+ Dim 1 is the input node, dim 2 the output node of the edge, dim 3 the depth of the edge, dim 4 the feature
+ """
+ (nrows, ncols) = path.shape
+ assert nrows == ncols
+ cdef unsigned int n = nrows
+ cdef unsigned int max_dist_copy = max_dist
+
+ path_copy = path.astype(long, order='C', casting='safe', copy=True)
+ edge_feat_copy = edge_feat.astype(long, order='C', casting='safe', copy=True)
+ assert path_copy.flags['C_CONTIGUOUS']
+ assert edge_feat_copy.flags['C_CONTIGUOUS']
+
+ cdef numpy.ndarray[numpy.int32_t, ndim=4, mode='c'] edge_fea_all = -1 * np.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=np.int32)
+ cdef unsigned int i, j, k, num_path, cur
+
+ for i in range(n):
+ for j in range(n):
+ if i == j:
+ continue
+ if path_copy[i][j] == UNREACHABLE_NODE_DISTANCE:
+ continue
+ path = [i] + get_all_edges(path_copy, i, j) + [j]
+ num_path = len(path) - 1
+ for k in range(num_path):
+ edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :]
+
+ return edge_fea_all
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py
new file mode 100644
index 0000000000000000000000000000000000000000..88657bab435d95cf5f06014524b184939e8582c7
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py
@@ -0,0 +1,135 @@
+# Copyright (c) Microsoft Corporation and HuggingFace
+# Licensed under the MIT License.
+
+from collections.abc import Mapping
+from typing import Any
+
+import numpy as np
+import torch
+
+from ....utils import is_cython_available, requires_backends
+
+
+if is_cython_available():
+ import pyximport
+
+ pyximport.install(setup_args={"include_dirs": np.get_include()})
+ from . import algos_graphormer
+
+
+def convert_to_single_emb(x, offset: int = 512):
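+ # Shift each feature column into its own id range so that distinct columns never
+ # collide in a shared embedding table; e.g. with offset=512 a hypothetical row
+ # [4, 7] becomes [5, 520] (4 + 1 + 0 * 512, 7 + 1 + 1 * 512).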
+ feature_num = x.shape[1] if len(x.shape) > 1 else 1
+ feature_offset = 1 + np.arange(0, feature_num * offset, offset, dtype=np.int64)
+ x = x + feature_offset
+ return x
+
+
+def preprocess_item(item, keep_features=True):
+ requires_backends(preprocess_item, ["cython"])
+
+ if keep_features and "edge_attr" in item: # edge_attr
+ edge_attr = np.asarray(item["edge_attr"], dtype=np.int64)
+ else:
+ edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64) # same embedding for all
+
+ if keep_features and "node_feat" in item: # input_nodes
+ node_feature = np.asarray(item["node_feat"], dtype=np.int64)
+ else:
+ node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64) # same embedding for all
+
+ edge_index = np.asarray(item["edge_index"], dtype=np.int64)
+
+ input_nodes = convert_to_single_emb(node_feature) + 1
+ num_nodes = item["num_nodes"]
+
+ if len(edge_attr.shape) == 1:
+ edge_attr = edge_attr[:, None]
+ attn_edge_type = np.zeros([num_nodes, num_nodes, edge_attr.shape[-1]], dtype=np.int64)
+ attn_edge_type[edge_index[0], edge_index[1]] = convert_to_single_emb(edge_attr) + 1
+
+ # node adj matrix [num_nodes, num_nodes] bool
+ adj = np.zeros([num_nodes, num_nodes], dtype=bool)
+ adj[edge_index[0], edge_index[1]] = True
+
+ shortest_path_result, path = algos_graphormer.floyd_warshall(adj)
+ max_dist = np.amax(shortest_path_result)
+
+ input_edges = algos_graphormer.gen_edge_input(max_dist, path, attn_edge_type)
+ attn_bias = np.zeros([num_nodes + 1, num_nodes + 1], dtype=np.single) # with graph token
+
+ # combine
+ item["input_nodes"] = input_nodes + 1 # we shift all indices by one for padding
+ item["attn_bias"] = attn_bias
+ item["attn_edge_type"] = attn_edge_type
+ item["spatial_pos"] = shortest_path_result.astype(np.int64) + 1 # we shift all indices by one for padding
+ item["in_degree"] = np.sum(adj, axis=1).reshape(-1) + 1 # we shift all indices by one for padding
+ item["out_degree"] = item["in_degree"] # for undirected graph
+ item["input_edges"] = input_edges + 1 # we shift all indices by one for padding
+ if "labels" not in item:
+ item["labels"] = item["y"]
+
+ return item
+
+
+class GraphormerDataCollator:
+ def __init__(self, spatial_pos_max=20, on_the_fly_processing=False):
+ if not is_cython_available():
+ raise ImportError("Graphormer preprocessing needs Cython (pyximport)")
+
+ self.spatial_pos_max = spatial_pos_max
+ self.on_the_fly_processing = on_the_fly_processing
+
+ def __call__(self, features: list[dict]) -> dict[str, Any]:
+ if self.on_the_fly_processing:
+ features = [preprocess_item(i) for i in features]
+
+ if not isinstance(features[0], Mapping):
+ features = [vars(f) for f in features]
+ batch = {}
+
+ max_node_num = max(len(i["input_nodes"]) for i in features)
+ node_feat_size = len(features[0]["input_nodes"][0])
+ edge_feat_size = len(features[0]["attn_edge_type"][0][0])
+ max_dist = max(len(i["input_edges"][0][0]) for i in features)
+ edge_input_size = len(features[0]["input_edges"][0][0][0])
+ batch_size = len(features)
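+ # All per-graph tensors below are zero-padded up to the largest graph in the
+ # batch; id 0 is safe as padding because preprocess_item shifted every index by one.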
+
+ batch["attn_bias"] = torch.zeros(batch_size, max_node_num + 1, max_node_num + 1, dtype=torch.float)
+ batch["attn_edge_type"] = torch.zeros(batch_size, max_node_num, max_node_num, edge_feat_size, dtype=torch.long)
+ batch["spatial_pos"] = torch.zeros(batch_size, max_node_num, max_node_num, dtype=torch.long)
+ batch["in_degree"] = torch.zeros(batch_size, max_node_num, dtype=torch.long)
+ batch["input_nodes"] = torch.zeros(batch_size, max_node_num, node_feat_size, dtype=torch.long)
+ batch["input_edges"] = torch.zeros(
+ batch_size, max_node_num, max_node_num, max_dist, edge_input_size, dtype=torch.long
+ )
+
+ for ix, f in enumerate(features):
+ for k in ["attn_bias", "attn_edge_type", "spatial_pos", "in_degree", "input_nodes", "input_edges"]:
+ f[k] = torch.tensor(f[k])
+
+ if len(f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max]) > 0:
+ f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max] = float("-inf")
+
+ batch["attn_bias"][ix, : f["attn_bias"].shape[0], : f["attn_bias"].shape[1]] = f["attn_bias"]
+ batch["attn_edge_type"][ix, : f["attn_edge_type"].shape[0], : f["attn_edge_type"].shape[1], :] = f[
+ "attn_edge_type"
+ ]
+ batch["spatial_pos"][ix, : f["spatial_pos"].shape[0], : f["spatial_pos"].shape[1]] = f["spatial_pos"]
+ batch["in_degree"][ix, : f["in_degree"].shape[0]] = f["in_degree"]
+ batch["input_nodes"][ix, : f["input_nodes"].shape[0], :] = f["input_nodes"]
+ batch["input_edges"][
+ ix, : f["input_edges"].shape[0], : f["input_edges"].shape[1], : f["input_edges"].shape[2], :
+ ] = f["input_edges"]
+
+ batch["out_degree"] = batch["in_degree"]
+
+ sample = features[0]["labels"]
+ if len(sample) == 1: # one task
+ if isinstance(sample[0], float): # regression
+ batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
+ else: # binary classification
+ batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
+ else: # multi task classification, left to float to keep the NaNs
+ batch["labels"] = torch.from_numpy(np.stack([i["labels"] for i in features], axis=0))
+
+ return batch
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py
new file mode 100644
index 0000000000000000000000000000000000000000..81c13c6a802b3128d4f88b23f59ac31e8484e901
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py
@@ -0,0 +1,220 @@
+# coding=utf-8
+# Copyright 2022 Microsoft, clefourrier and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Graphormer model configuration"""
+
+from typing import Optional
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraphormerConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`~GraphormerModel`]. It is used to instantiate an
+ Graphormer model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the Graphormer
+ [graphormer-base-pcqm4mv1](https://huggingface.co/graphormer-base-pcqm4mv1) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ num_classes (`int`, *optional*, defaults to 1):
+ Number of target classes or labels, set to n for binary classification of n tasks.
+ num_atoms (`int`, *optional*, defaults to 512*9):
+ Number of node types in the graphs.
+ num_edges (`int`, *optional*, defaults to 512*3):
+ Number of edge types in the graph.
+ num_in_degree (`int`, *optional*, defaults to 512):
+ Number of in-degree types in the input graphs.
+ num_out_degree (`int`, *optional*, defaults to 512):
+ Number of out-degree types in the input graphs.
+ num_edge_dis (`int`, *optional*, defaults to 128):
+ Number of edge distance types in the input graphs.
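+ num_spatial (`int`, *optional*, defaults to 512):
+ Number of spatial position (shortest-path distance) types in the input graphs.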
+ multi_hop_max_dist (`int`, *optional*, defaults to 5):
+ Maximum distance of multi hop edges between two nodes.
+ spatial_pos_max (`int`, *optional*, defaults to 1024):
+ Maximum distance between nodes in the graph attention bias matrices, used during preprocessing and
+ collation.
+ edge_type (`str`, *optional*, defaults to `"multi_hop"`):
+ Type of edge relation chosen.
+ max_nodes (`int`, *optional*, defaults to 512):
+ Maximum number of nodes which can be parsed for the input graphs.
+ share_input_output_embed (`bool`, *optional*, defaults to `False`):
+ Shares the embedding layer between encoder and decoder - careful, True is not implemented.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers.
+ embedding_dim (`int`, *optional*, defaults to 768):
+ Dimension of the embedding layer in encoder.
+ ffn_embedding_dim (`int`, *optional*, defaults to 768):
+ Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads in the encoder.
+ self_attention (`bool`, *optional*, defaults to `True`):
+ Model is self attentive (False not implemented).
+ activation_fn (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for the attention weights.
+ activation_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for the activation of the linear transformer layer.
+ layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the encoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+ for more details.
+ bias (`bool`, *optional*, defaults to `True`):
+ Uses bias in the attention module - unsupported at the moment.
+ embed_scale (`float`, *optional*, defaults to `None`):
+ Scaling factor for the node embeddings.
+ num_trans_layers_to_freeze (`int`, *optional*, defaults to 0):
+ Number of transformer layers to freeze.
+ encoder_normalize_before (`bool`, *optional*, defaults to `False`):
+ Normalize features before encoding the graph.
+ pre_layernorm (`bool`, *optional*, defaults to `False`):
+ Apply layernorm before self attention and the feed forward network. Without this, post layernorm will be
+ used.
+ apply_graphormer_init (`bool`, *optional*, defaults to `False`):
+ Apply a custom graphormer initialisation to the model before training.
+ freeze_embeddings (`bool`, *optional*, defaults to `False`):
+ Freeze the embedding layer, or train it along the model.
+ q_noise (`float`, *optional*, defaults to 0.0):
+ Amount of quantization noise (see "Training with Quantization Noise for Extreme Model Compression"). (For
+ more detail, see fairseq's documentation on quant_noise).
+ qn_block_size (`int`, *optional*, defaults to 8):
+ Size of the blocks for subsequent quantization with iPQ (see q_noise).
+ kdim (`int`, *optional*, defaults to None):
+ Dimension of the key in the attention, if different from the other values.
+ vdim (`int`, *optional*, defaults to None):
+ Dimension of the value in the attention, if different from the other values.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ traceable (`bool`, *optional*, defaults to `False`):
+ Changes return value of the encoder's inner_state to stacked tensors.
+
+ Example:
+ ```python
+ >>> from transformers import GraphormerForGraphClassification, GraphormerConfig
+
+ >>> # Initializing a Graphormer graphormer-base-pcqm4mv1 style configuration
+ >>> configuration = GraphormerConfig()
+
+ >>> # Initializing a model from the graphormer-base-pcqm4mv1 style configuration
+ >>> model = GraphormerForGraphClassification(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+ """
+
+ model_type = "graphormer"
+ keys_to_ignore_at_inference = ["past_key_values"]
+
+ def __init__(
+ self,
+ num_classes: int = 1,
+ num_atoms: int = 512 * 9,
+ num_edges: int = 512 * 3,
+ num_in_degree: int = 512,
+ num_out_degree: int = 512,
+ num_spatial: int = 512,
+ num_edge_dis: int = 128,
+ multi_hop_max_dist: int = 5, # sometimes set to 20
+ spatial_pos_max: int = 1024,
+ edge_type: str = "multi_hop",
+ max_nodes: int = 512,
+ share_input_output_embed: bool = False,
+ num_hidden_layers: int = 12,
+ embedding_dim: int = 768,
+ ffn_embedding_dim: int = 768,
+ num_attention_heads: int = 32,
+ dropout: float = 0.1,
+ attention_dropout: float = 0.1,
+ activation_dropout: float = 0.1,
+ layerdrop: float = 0.0,
+ encoder_normalize_before: bool = False,
+ pre_layernorm: bool = False,
+ apply_graphormer_init: bool = False,
+ activation_fn: str = "gelu",
+ embed_scale: Optional[float] = None,
+ freeze_embeddings: bool = False,
+ num_trans_layers_to_freeze: int = 0,
+ traceable: bool = False,
+ q_noise: float = 0.0,
+ qn_block_size: int = 8,
+ kdim: Optional[int] = None,
+ vdim: Optional[int] = None,
+ bias: bool = True,
+ self_attention: bool = True,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ **kwargs,
+ ):
+ self.num_classes = num_classes
+ self.num_atoms = num_atoms
+ self.num_in_degree = num_in_degree
+ self.num_out_degree = num_out_degree
+ self.num_edges = num_edges
+ self.num_spatial = num_spatial
+ self.num_edge_dis = num_edge_dis
+ self.edge_type = edge_type
+ self.multi_hop_max_dist = multi_hop_max_dist
+ self.spatial_pos_max = spatial_pos_max
+ self.max_nodes = max_nodes
+ self.num_hidden_layers = num_hidden_layers
+ self.embedding_dim = embedding_dim
+ self.hidden_size = embedding_dim
+ self.ffn_embedding_dim = ffn_embedding_dim
+ self.num_attention_heads = num_attention_heads
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.layerdrop = layerdrop
+ self.encoder_normalize_before = encoder_normalize_before
+ self.pre_layernorm = pre_layernorm
+ self.apply_graphormer_init = apply_graphormer_init
+ self.activation_fn = activation_fn
+ self.embed_scale = embed_scale
+ self.freeze_embeddings = freeze_embeddings
+ self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
+ self.share_input_output_embed = share_input_output_embed
+ self.traceable = traceable
+ self.q_noise = q_noise
+ self.qn_block_size = qn_block_size
+
+ # These parameters are here for future extensions
+ # atm, the model only supports self attention
+ self.kdim = kdim
+ self.vdim = vdim
+ self.self_attention = self_attention
+ self.bias = bias
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ **kwargs,
+ )
+
+
+__all__ = ["GraphormerConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3e8ea742c8dabd4bd13e34ea84fa60547fc0645
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py
@@ -0,0 +1,912 @@
+# coding=utf-8
+ # Copyright 2022 Microsoft, clefourrier and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Graphormer model."""
+
+import math
+from collections.abc import Iterable, Iterator
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....modeling_outputs import (
+ BaseModelOutputWithNoAttention,
+ SequenceClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....utils import logging
+from .configuration_graphormer import GraphormerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "graphormer-base-pcqm4mv1"
+_CONFIG_FOR_DOC = "GraphormerConfig"
+
+
+def quant_noise(module: nn.Module, p: float, block_size: int):
+ """
+ From:
+ https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/quant_noise.py
+
+ Wraps modules and applies quantization noise to the weights for subsequent quantization with Iterative Product
+ Quantization as described in "Training with Quantization Noise for Extreme Model Compression"
+
+ Args:
+ - module: nn.Module
+ - p: amount of Quantization Noise
+ - block_size: size of the blocks for subsequent quantization with iPQ
+
+ Remarks:
+ - Module weights must have the right sizes wrt the block size
+ - Only Linear, Embedding and Conv2d modules are supported for the moment
+ - For more detail on how to quantize by blocks with convolutional weights, see "And the Bit Goes Down:
+ Revisiting the Quantization of Neural Networks"
+ - We implement the simplest form of noise here as stated in the paper which consists in randomly dropping
+ blocks
+ """
+
+ # if no quantization noise, don't register hook
+ if p <= 0:
+ return module
+
+ # supported modules
+ if not isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)):
+ raise NotImplementedError("Module unsupported for quant_noise.")
+
+ # test whether module.weight has the right sizes wrt block_size
+ is_conv = module.weight.ndim == 4
+
+ # 2D matrix
+ if not is_conv:
+ if module.weight.size(1) % block_size != 0:
+ raise AssertionError("Input features must be a multiple of block sizes")
+
+ # 4D matrix
+ else:
+ # 1x1 convolutions
+ if module.kernel_size == (1, 1):
+ if module.in_channels % block_size != 0:
+ raise AssertionError("Input channels must be a multiple of block sizes")
+ # regular convolutions
+ else:
+ k = module.kernel_size[0] * module.kernel_size[1]
+ if k % block_size != 0:
+ raise AssertionError("Kernel size must be a multiple of block size")
+
+ def _forward_pre_hook(mod, input):
+ # no noise for evaluation
+ if mod.training:
+ if not is_conv:
+ # gather weight and sizes
+ weight = mod.weight
+ in_features = weight.size(1)
+ out_features = weight.size(0)
+
+ # split weight matrix into blocks and randomly drop selected blocks
+ mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
+ mask.bernoulli_(p)
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
+
+ else:
+ # gather weight and sizes
+ weight = mod.weight
+ in_channels = mod.in_channels
+ out_channels = mod.out_channels
+
+ # split weight matrix into blocks and randomly drop selected blocks
+ if mod.kernel_size == (1, 1):
+ mask = torch.zeros(
+ int(in_channels // block_size * out_channels),
+ device=weight.device,
+ )
+ mask.bernoulli_(p)
+ mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
+ else:
+ mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
+ mask.bernoulli_(p)
+ mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
+
+ # scale weights and apply mask
+ mask = mask.to(torch.bool) # x.bool() is not currently supported in TorchScript
+ s = 1 / (1 - p)
+ mod.weight.data = s * weight.masked_fill(mask, 0)
+
+ module.register_forward_pre_hook(_forward_pre_hook)
+ return module
+
+
+class LayerDropModuleList(nn.ModuleList):
+ """
+ From:
+ https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/layer_drop.py
+ A LayerDrop implementation based on [`torch.nn.ModuleList`]. LayerDrop as described in
+ https://huggingface.co/papers/1909.11556.
+
+ We refresh the choice of which layers to drop every time we iterate over the LayerDropModuleList instance. During
+ evaluation we always iterate over all layers.
+
+ Usage:
+
+ ```python
+ layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3])
+ for layer in layers: # this might iterate over layers 1 and 3
+ x = layer(x)
+ for layer in layers: # this might iterate over all layers
+ x = layer(x)
+ for layer in layers: # this might not iterate over any layers
+ x = layer(x)
+ ```
+
+ Args:
+ p (float): probability of dropping out each layer
+ modules (iterable, optional): an iterable of modules to add
+ """
+
+ def __init__(self, p: float, modules: Optional[Iterable[nn.Module]] = None):
+ super().__init__(modules)
+ self.p = p
+
+ def __iter__(self) -> Iterator[nn.Module]:
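+ # draw fresh per-layer probabilities on every iteration, so the dropped subset changes between passes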
+ dropout_probs = torch.empty(len(self)).uniform_()
+ for i, m in enumerate(super().__iter__()):
+ if not self.training or (dropout_probs[i] > self.p):
+ yield m
+
+
+class GraphormerGraphNodeFeature(nn.Module):
+ """
+ Compute node features for each node in the graph.
+ """
+
+ def __init__(self, config: GraphormerConfig):
+ super().__init__()
+ self.num_heads = config.num_attention_heads
+ self.num_atoms = config.num_atoms
+
+ self.atom_encoder = nn.Embedding(config.num_atoms + 1, config.hidden_size, padding_idx=config.pad_token_id)
+ self.in_degree_encoder = nn.Embedding(
+ config.num_in_degree, config.hidden_size, padding_idx=config.pad_token_id
+ )
+ self.out_degree_encoder = nn.Embedding(
+ config.num_out_degree, config.hidden_size, padding_idx=config.pad_token_id
+ )
+
+ self.graph_token = nn.Embedding(1, config.hidden_size)
+
+ def forward(
+ self,
+ input_nodes: torch.LongTensor,
+ in_degree: torch.LongTensor,
+ out_degree: torch.LongTensor,
+ ) -> torch.Tensor:
+ n_graph, n_node = input_nodes.size()[:2]
+
+ node_feature = ( # node feature + graph token
+ self.atom_encoder(input_nodes).sum(dim=-2) # [n_graph, n_node, n_hidden]
+ + self.in_degree_encoder(in_degree)
+ + self.out_degree_encoder(out_degree)
+ )
+
+ graph_token_feature = self.graph_token.weight.unsqueeze(0).repeat(n_graph, 1, 1)
+
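+ # prepend the learned graph token: [n_graph, n_node + 1, hidden_size]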
+ graph_node_feature = torch.cat([graph_token_feature, node_feature], dim=1)
+
+ return graph_node_feature
+
+
+class GraphormerGraphAttnBias(nn.Module):
+ """
+ Compute attention bias for each head.
+ """
+
+ def __init__(self, config: GraphormerConfig):
+ super().__init__()
+ self.num_heads = config.num_attention_heads
+ self.multi_hop_max_dist = config.multi_hop_max_dist
+
+ # We do not change edge feature embedding learning, as edge embeddings are represented as a combination of the original features
+ # + shortest path
+ self.edge_encoder = nn.Embedding(config.num_edges + 1, config.num_attention_heads, padding_idx=0)
+
+ self.edge_type = config.edge_type
+ if self.edge_type == "multi_hop":
+ self.edge_dis_encoder = nn.Embedding(
+ config.num_edge_dis * config.num_attention_heads * config.num_attention_heads,
+ 1,
+ )
+
+ self.spatial_pos_encoder = nn.Embedding(config.num_spatial, config.num_attention_heads, padding_idx=0)
+
+ self.graph_token_virtual_distance = nn.Embedding(1, config.num_attention_heads)
+
+ def forward(
+ self,
+ input_nodes: torch.LongTensor,
+ attn_bias: torch.Tensor,
+ spatial_pos: torch.LongTensor,
+ input_edges: torch.LongTensor,
+ attn_edge_type: torch.LongTensor,
+ ) -> torch.Tensor:
+ n_graph, n_node = input_nodes.size()[:2]
+ graph_attn_bias = attn_bias.clone()
+ graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat(
+ 1, self.num_heads, 1, 1
+ ) # [n_graph, n_head, n_node+1, n_node+1]
+
+ # spatial pos
+ # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node]
+ spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2)
+ graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + spatial_pos_bias
+
+ # reset spatial pos here
+ t = self.graph_token_virtual_distance.weight.view(1, self.num_heads, 1)
+ graph_attn_bias[:, :, 1:, 0] = graph_attn_bias[:, :, 1:, 0] + t
+ graph_attn_bias[:, :, 0, :] = graph_attn_bias[:, :, 0, :] + t
+
+ # edge feature
+ if self.edge_type == "multi_hop":
+ spatial_pos_ = spatial_pos.clone()
+
+ spatial_pos_[spatial_pos_ == 0] = 1 # set pad to 1
+ # set 1 to 1, spatial_pos > 1 to spatial_pos - 1
+ spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_)
+ if self.multi_hop_max_dist > 0:
+ spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist)
+ input_edges = input_edges[:, :, :, : self.multi_hop_max_dist, :]
+ # [n_graph, n_node, n_node, max_dist, n_head]
+
+ input_edges = self.edge_encoder(input_edges).mean(-2)
+ max_dist = input_edges.size(-2)
+ edge_input_flat = input_edges.permute(3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads)
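+ # mix the head dimension with a learned [n_head, n_head] matrix for each distance step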
+ edge_input_flat = torch.bmm(
+ edge_input_flat,
+ self.edge_dis_encoder.weight.reshape(-1, self.num_heads, self.num_heads)[:max_dist, :, :],
+ )
+ input_edges = edge_input_flat.reshape(max_dist, n_graph, n_node, n_node, self.num_heads).permute(
+ 1, 2, 3, 0, 4
+ )
+ input_edges = (input_edges.sum(-2) / (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2)
+ else:
+ # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node]
+ input_edges = self.edge_encoder(attn_edge_type).mean(-2).permute(0, 3, 1, 2)
+
+ graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + input_edges
+ graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1) # reset
+
+ return graph_attn_bias
+
+
+class GraphormerMultiheadAttention(nn.Module):
+ """Multi-headed attention.
+
+ See "Attention Is All You Need" for more details.
+ """
+
+ def __init__(self, config: GraphormerConfig):
+ super().__init__()
+ self.embedding_dim = config.embedding_dim
+ self.kdim = config.kdim if config.kdim is not None else config.embedding_dim
+ self.vdim = config.vdim if config.vdim is not None else config.embedding_dim
+ self.qkv_same_dim = self.kdim == config.embedding_dim and self.vdim == config.embedding_dim
+
+ self.num_heads = config.num_attention_heads
+ self.attention_dropout_module = torch.nn.Dropout(p=config.attention_dropout, inplace=False)
+
+ self.head_dim = config.embedding_dim // config.num_attention_heads
+ if not (self.head_dim * config.num_attention_heads == self.embedding_dim):
+ raise AssertionError("The embedding_dim must be divisible by num_heads.")
+ self.scaling = self.head_dim**-0.5
+
+ self.self_attention = True # config.self_attention
+ if not (self.self_attention):
+ raise NotImplementedError("The Graphormer model only supports self attention for now.")
+ if self.self_attention and not self.qkv_same_dim:
+ raise AssertionError("Self-attention requires query, key and value to be of the same size.")
+
+ self.k_proj = quant_noise(
+ nn.Linear(self.kdim, config.embedding_dim, bias=config.bias),
+ config.q_noise,
+ config.qn_block_size,
+ )
+ self.v_proj = quant_noise(
+ nn.Linear(self.vdim, config.embedding_dim, bias=config.bias),
+ config.q_noise,
+ config.qn_block_size,
+ )
+ self.q_proj = quant_noise(
+ nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias),
+ config.q_noise,
+ config.qn_block_size,
+ )
+
+ self.out_proj = quant_noise(
+ nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias),
+ config.q_noise,
+ config.qn_block_size,
+ )
+
+ self.onnx_trace = False
+
+ def reset_parameters(self):
+ if self.qkv_same_dim:
+ # Empirically observed the convergence to be much better with
+ # the scaled initialization
+ nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
+ nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
+ nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
+ else:
+ nn.init.xavier_uniform_(self.k_proj.weight)
+ nn.init.xavier_uniform_(self.v_proj.weight)
+ nn.init.xavier_uniform_(self.q_proj.weight)
+
+ nn.init.xavier_uniform_(self.out_proj.weight)
+ if self.out_proj.bias is not None:
+ nn.init.constant_(self.out_proj.bias, 0.0)
+
+ def forward(
+ self,
+ query: torch.LongTensor,
+ key: Optional[torch.Tensor],
+ value: Optional[torch.Tensor],
+ attn_bias: Optional[torch.Tensor],
+ key_padding_mask: Optional[torch.Tensor] = None,
+ need_weights: bool = True,
+ attn_mask: Optional[torch.Tensor] = None,
+ before_softmax: bool = False,
+ need_head_weights: bool = False,
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Args:
+ key_padding_mask (torch.ByteTensor, optional): mask to exclude
+ keys that are pads, of shape `(batch, src_len)`, where padding elements are indicated by 1s.
+ need_weights (bool, optional): return the attention weights,
+ averaged over heads (default: True).
+ attn_mask (torch.ByteTensor, optional): typically used to
+ implement causal attention, where the mask prevents the attention from looking forward in time
+ (default: None).
+ before_softmax (bool, optional): return the raw attention
+ weights and values before the attention softmax.
+ need_head_weights (bool, optional): return the attention
+ weights for each head. Implies *need_weights*. Default: return the average attention weights over all
+ heads.
+ """
+ if need_head_weights:
+ need_weights = True
+
+ tgt_len, bsz, embedding_dim = query.size()
+ src_len = tgt_len
+ if not (embedding_dim == self.embedding_dim):
+ raise AssertionError(
+ f"The query embedding dimension {embedding_dim} is not equal to the expected embedding_dim"
+ f" {self.embedding_dim}."
+ )
+ if not (list(query.size()) == [tgt_len, bsz, embedding_dim]):
+ raise AssertionError("Query size incorrect in Graphormer, compared to model dimensions.")
+
+ if key is not None:
+ src_len, key_bsz, _ = key.size()
+ if not torch.jit.is_scripting():
+ if (key_bsz != bsz) or (value is None) or ((src_len, bsz) != value.shape[:2]):
+ raise AssertionError(
+ "The batch shape does not match the key or value shapes provided to the attention."
+ )
+
+ q = self.q_proj(query)
+ k = self.k_proj(query)
+ v = self.v_proj(query)
+
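+ # scale queries by 1/sqrt(head_dim) before computing the attention scores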
+ q *= self.scaling
+
+ q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+ if k is not None:
+ k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+ if v is not None:
+ v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+
+ if (k is None) or not (k.size(1) == src_len):
+ raise AssertionError("The shape of the key generated in the attention is incorrect")
+
+ # This is part of a workaround to get around fork/join parallelism
+ # not supporting Optional types.
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
+ key_padding_mask = None
+
+ if key_padding_mask is not None:
+ if key_padding_mask.size(0) != bsz or key_padding_mask.size(1) != src_len:
+ raise AssertionError(
+ "The shape of the generated padding mask for the key does not match expected dimensions."
+ )
+ attn_weights = torch.bmm(q, k.transpose(1, 2))
+ attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+ if list(attn_weights.size()) != [bsz * self.num_heads, tgt_len, src_len]:
+ raise AssertionError("The attention weights generated do not match the expected dimensions.")
+
+ if attn_bias is not None:
+ attn_weights += attn_bias.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if attn_mask is not None:
+ attn_mask = attn_mask.unsqueeze(0)
+ attn_weights += attn_mask
+
+ if key_padding_mask is not None:
+ # don't attend to padding symbols
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.masked_fill(
+ key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
+ )
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if before_softmax:
+ return attn_weights, v
+
+ attn_weights_float = torch.nn.functional.softmax(attn_weights, dim=-1)
+ attn_weights = attn_weights_float.type_as(attn_weights)
+ attn_probs = self.attention_dropout_module(attn_weights)
+
+ if v is None:
+ raise AssertionError("No value generated")
+ attn = torch.bmm(attn_probs, v)
+ if list(attn.size()) != [bsz * self.num_heads, tgt_len, self.head_dim]:
+ raise AssertionError("The attention generated do not match the expected dimensions.")
+
+ attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embedding_dim)
+ attn: torch.Tensor = self.out_proj(attn)
+
+ attn_weights = None
+ if need_weights:
+ attn_weights = attn_weights_float.contiguous().view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
+ if not need_head_weights:
+ # average attention weights over heads
+ attn_weights = attn_weights.mean(dim=0)
+
+ return attn, attn_weights
+
+ def apply_sparse_mask(self, attn_weights: torch.Tensor, tgt_len: int, src_len: int, bsz: int) -> torch.Tensor:
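+ # hook for subclasses: the base implementation applies no sparse mask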
+ return attn_weights
+
+
+class GraphormerGraphEncoderLayer(nn.Module):
+ def __init__(self, config: GraphormerConfig) -> None:
+ super().__init__()
+
+ # Initialize parameters
+ self.embedding_dim = config.embedding_dim
+ self.num_attention_heads = config.num_attention_heads
+ self.q_noise = config.q_noise
+ self.qn_block_size = config.qn_block_size
+ self.pre_layernorm = config.pre_layernorm
+
+ self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
+
+ self.activation_dropout_module = torch.nn.Dropout(p=config.activation_dropout, inplace=False)
+
+ # Initialize blocks
+ self.activation_fn = ACT2FN[config.activation_fn]
+ self.self_attn = GraphormerMultiheadAttention(config)
+
+ # layer norm associated with the self attention layer
+ self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim)
+
+ self.fc1 = self.build_fc(
+ self.embedding_dim,
+ config.ffn_embedding_dim,
+ q_noise=config.q_noise,
+ qn_block_size=config.qn_block_size,
+ )
+ self.fc2 = self.build_fc(
+ config.ffn_embedding_dim,
+ self.embedding_dim,
+ q_noise=config.q_noise,
+ qn_block_size=config.qn_block_size,
+ )
+
+ # layer norm associated with the position wise feed-forward NN
+ self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
+
+ def build_fc(
+ self, input_dim: int, output_dim: int, q_noise: float, qn_block_size: int
+ ) -> Union[nn.Module, nn.Linear, nn.Embedding, nn.Conv2d]:
+ return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+ def forward(
+ self,
+ input_nodes: torch.Tensor,
+ self_attn_bias: Optional[torch.Tensor] = None,
+ self_attn_mask: Optional[torch.Tensor] = None,
+ self_attn_padding_mask: Optional[torch.Tensor] = None,
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ nn.LayerNorm is applied either before or after the self-attention/ffn modules similar to the original
+ Transformer implementation.
+ """
+ residual = input_nodes
+ if self.pre_layernorm:
+ input_nodes = self.self_attn_layer_norm(input_nodes)
+
+ input_nodes, attn = self.self_attn(
+ query=input_nodes,
+ key=input_nodes,
+ value=input_nodes,
+ attn_bias=self_attn_bias,
+ key_padding_mask=self_attn_padding_mask,
+ need_weights=False,
+ attn_mask=self_attn_mask,
+ )
+ input_nodes = self.dropout_module(input_nodes)
+ input_nodes = residual + input_nodes
+ if not self.pre_layernorm:
+ input_nodes = self.self_attn_layer_norm(input_nodes)
+
+ residual = input_nodes
+ if self.pre_layernorm:
+ input_nodes = self.final_layer_norm(input_nodes)
+ input_nodes = self.activation_fn(self.fc1(input_nodes))
+ input_nodes = self.activation_dropout_module(input_nodes)
+ input_nodes = self.fc2(input_nodes)
+ input_nodes = self.dropout_module(input_nodes)
+ input_nodes = residual + input_nodes
+ if not self.pre_layernorm:
+ input_nodes = self.final_layer_norm(input_nodes)
+
+ return input_nodes, attn
+
+
+class GraphormerGraphEncoder(nn.Module):
+ def __init__(self, config: GraphormerConfig):
+ super().__init__()
+
+ self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
+ self.layerdrop = config.layerdrop
+ self.embedding_dim = config.embedding_dim
+ self.apply_graphormer_init = config.apply_graphormer_init
+ self.traceable = config.traceable
+
+ self.graph_node_feature = GraphormerGraphNodeFeature(config)
+ self.graph_attn_bias = GraphormerGraphAttnBias(config)
+
+ self.embed_scale = config.embed_scale
+
+ if config.q_noise > 0:
+ self.quant_noise = quant_noise(
+ nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
+ config.q_noise,
+ config.qn_block_size,
+ )
+ else:
+ self.quant_noise = None
+
+ if config.encoder_normalize_before:
+ self.emb_layer_norm = nn.LayerNorm(self.embedding_dim)
+ else:
+ self.emb_layer_norm = None
+
+ if config.pre_layernorm:
+ self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
+
+ if self.layerdrop > 0.0:
+ self.layers = LayerDropModuleList(p=self.layerdrop)
+ else:
+ self.layers = nn.ModuleList([])
+ self.layers.extend([GraphormerGraphEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+
+ # Apply initialization of model params after building the model
+ if config.freeze_embeddings:
+ raise NotImplementedError("Freezing embeddings is not implemented yet.")
+
+ for layer in range(config.num_trans_layers_to_freeze):
+ m = self.layers[layer]
+ if m is not None:
+ for p in m.parameters():
+ p.requires_grad = False
+
+ def forward(
+ self,
+ input_nodes: torch.LongTensor,
+ input_edges: torch.LongTensor,
+ attn_bias: torch.Tensor,
+ in_degree: torch.LongTensor,
+ out_degree: torch.LongTensor,
+ spatial_pos: torch.LongTensor,
+ attn_edge_type: torch.LongTensor,
+ perturb=None,
+ last_state_only: bool = False,
+ token_embeddings: Optional[torch.Tensor] = None,
+ attn_mask: Optional[torch.Tensor] = None,
+ ) -> tuple[Union[torch.Tensor, list[torch.LongTensor]], torch.Tensor]:
+ # compute padding mask. This is needed for multi-head attention
+ data_x = input_nodes
+ n_graph, n_node = data_x.size()[:2]
+ padding_mask = (data_x[:, :, 0]).eq(0)
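+ # the graph token slot prepended below is never padding, so it gets a zero entry in the mask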
+ padding_mask_cls = torch.zeros(n_graph, 1, device=padding_mask.device, dtype=padding_mask.dtype)
+ padding_mask = torch.cat((padding_mask_cls, padding_mask), dim=1)
+
+ attn_bias = self.graph_attn_bias(input_nodes, attn_bias, spatial_pos, input_edges, attn_edge_type)
+
+ if token_embeddings is not None:
+ input_nodes = token_embeddings
+ else:
+ input_nodes = self.graph_node_feature(input_nodes, in_degree, out_degree)
+
+ if perturb is not None:
+ input_nodes[:, 1:, :] += perturb
+
+ if self.embed_scale is not None:
+ input_nodes = input_nodes * self.embed_scale
+
+ if self.quant_noise is not None:
+ input_nodes = self.quant_noise(input_nodes)
+
+ if self.emb_layer_norm is not None:
+ input_nodes = self.emb_layer_norm(input_nodes)
+
+ input_nodes = self.dropout_module(input_nodes)
+
+ input_nodes = input_nodes.transpose(0, 1)
+
+ inner_states = []
+ if not last_state_only:
+ inner_states.append(input_nodes)
+
+ for layer in self.layers:
+ input_nodes, _ = layer(
+ input_nodes,
+ self_attn_padding_mask=padding_mask,
+ self_attn_mask=attn_mask,
+ self_attn_bias=attn_bias,
+ )
+ if not last_state_only:
+ inner_states.append(input_nodes)
+
+ graph_rep = input_nodes[0, :, :]
+
+ if last_state_only:
+ inner_states = [input_nodes]
+
+ if self.traceable:
+ return torch.stack(inner_states), graph_rep
+ else:
+ return inner_states, graph_rep
+
+
+class GraphormerDecoderHead(nn.Module):
+ def __init__(self, embedding_dim: int, num_classes: int):
+ """num_classes should be 1 for regression, or the number of classes for classification"""
+ super().__init__()
+ self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
+ self.classifier = nn.Linear(embedding_dim, num_classes, bias=False)
+ self.num_classes = num_classes
+
+ def forward(self, input_nodes: torch.Tensor, **unused) -> torch.Tensor:
+ input_nodes = self.classifier(input_nodes)
+ input_nodes = input_nodes + self.lm_output_learned_bias
+ return input_nodes
+
+
+class GraphormerPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: GraphormerConfig
+ base_model_prefix = "graphormer"
+ main_input_name_nodes = "input_nodes"
+ main_input_name_edges = "input_edges"
+
+ def normal_(self, data: torch.Tensor):
+ # with FSDP, module params will be on CUDA, so we cast them back to CPU
+ # so that the RNG is consistent with and without FSDP
+ data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
+
+ def init_graphormer_params(self, module: Union[nn.Linear, nn.Embedding, GraphormerMultiheadAttention]):
+ """
+ Initialize the weights specific to the Graphormer Model.
+ """
+ if isinstance(module, nn.Linear):
+ self.normal_(module.weight.data)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ if isinstance(module, nn.Embedding):
+ self.normal_(module.weight.data)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ if isinstance(module, GraphormerMultiheadAttention):
+ self.normal_(module.q_proj.weight.data)
+ self.normal_(module.k_proj.weight.data)
+ self.normal_(module.v_proj.weight.data)
+
+ def _init_weights(
+ self,
+ module: Union[
+ nn.Linear, nn.Conv2d, nn.Embedding, nn.LayerNorm, GraphormerMultiheadAttention, GraphormerGraphEncoder
+ ],
+ ):
+ """
+ Initialize the weights
+ """
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ # We might be missing part of the Linear init, dependent on the layer num
+ module.weight.data.normal_(mean=0.0, std=0.02)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=0.02)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, GraphormerMultiheadAttention):
+ module.q_proj.weight.data.normal_(mean=0.0, std=0.02)
+ module.k_proj.weight.data.normal_(mean=0.0, std=0.02)
+ module.v_proj.weight.data.normal_(mean=0.0, std=0.02)
+ module.reset_parameters()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, GraphormerGraphEncoder):
+ if module.apply_graphormer_init:
+ module.apply(self.init_graphormer_params)
+
+
+class GraphormerModel(GraphormerPreTrainedModel):
+ """The Graphormer model is a graph-encoder model.
+
+ It goes from a graph to its representation. If you want to use the model for a downstream classification task, use
+ GraphormerForGraphClassification instead. For any other downstream task, feel free to add a new class, or combine
+ this model with a downstream model of your choice, following the example in GraphormerForGraphClassification.
+ """
+
+ def __init__(self, config: GraphormerConfig):
+ super().__init__(config)
+ self.max_nodes = config.max_nodes
+
+ self.graph_encoder = GraphormerGraphEncoder(config)
+
+ self.share_input_output_embed = config.share_input_output_embed
+ self.lm_output_learned_bias = None
+
+ # remove_head is set to True during fine-tuning
+ self.load_softmax = not getattr(config, "remove_head", False)
+
+ self.lm_head_transform_weight = nn.Linear(config.embedding_dim, config.embedding_dim)
+ self.activation_fn = ACT2FN[config.activation_fn]
+ self.layer_norm = nn.LayerNorm(config.embedding_dim)
+
+ self.post_init()
+
+ def reset_output_layer_parameters(self):
+ self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
+
+ def forward(
+ self,
+ input_nodes: torch.LongTensor,
+ input_edges: torch.LongTensor,
+ attn_bias: torch.Tensor,
+ in_degree: torch.LongTensor,
+ out_degree: torch.LongTensor,
+ spatial_pos: torch.LongTensor,
+ attn_edge_type: torch.LongTensor,
+ perturb: Optional[torch.FloatTensor] = None,
+ masked_tokens: None = None,
+ return_dict: Optional[bool] = None,
+ **unused,
+ ) -> Union[tuple[torch.LongTensor], BaseModelOutputWithNoAttention]:
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ inner_states, graph_rep = self.graph_encoder(
+ input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type, perturb=perturb
+ )
+
+ # last inner state, then revert Batch and Graph len
+ input_nodes = inner_states[-1].transpose(0, 1)
+
+ # project masked tokens only
+ if masked_tokens is not None:
+ raise NotImplementedError
+
+ input_nodes = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(input_nodes)))
+
+ # project back to size of vocabulary
+ if self.share_input_output_embed and hasattr(self.graph_encoder.embed_tokens, "weight"):
+ input_nodes = torch.nn.functional.linear(input_nodes, self.graph_encoder.embed_tokens.weight)
+
+ if not return_dict:
+ return tuple(x for x in [input_nodes, inner_states] if x is not None)
+ return BaseModelOutputWithNoAttention(last_hidden_state=input_nodes, hidden_states=inner_states)
+
+ def max_nodes(self):
+ """Maximum output length supported by the encoder."""
+ return self.max_nodes
+
+
+class GraphormerForGraphClassification(GraphormerPreTrainedModel):
+ """
+ This model can be used for graph-level classification or regression tasks.
+
+ It can be trained on
+ - regression (by setting config.num_classes to 1); there should be one float-type label per graph
+ - single-task classification (by setting config.num_classes to the number of classes); there should be one integer
+ label per graph
+ - binary multi-task classification (by setting config.num_classes to the number of labels); there should be a list
+ of integer labels for each graph.
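+
+ Example (illustrative; `num_classes` selects the loss as described above):
+
+ ```python
+ config = GraphormerConfig(num_classes=1) # regression: one float label per graph
+ model = GraphormerForGraphClassification(config)
+ ```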
+ """
+
+ def __init__(self, config: GraphormerConfig):
+ super().__init__(config)
+ self.encoder = GraphormerModel(config)
+ self.embedding_dim = config.embedding_dim
+ self.num_classes = config.num_classes
+ self.classifier = GraphormerDecoderHead(self.embedding_dim, self.num_classes)
+ self.is_encoder_decoder = True
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ input_nodes: torch.LongTensor,
+ input_edges: torch.LongTensor,
+ attn_bias: torch.Tensor,
+ in_degree: torch.LongTensor,
+ out_degree: torch.LongTensor,
+ spatial_pos: torch.LongTensor,
+ attn_edge_type: torch.LongTensor,
+ labels: Optional[torch.LongTensor] = None,
+ return_dict: Optional[bool] = None,
+ **unused,
+ ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_outputs = self.encoder(
+ input_nodes,
+ input_edges,
+ attn_bias,
+ in_degree,
+ out_degree,
+ spatial_pos,
+ attn_edge_type,
+ return_dict=True,
+ )
+ outputs, hidden_states = encoder_outputs["last_hidden_state"], encoder_outputs["hidden_states"]
+
+ head_outputs = self.classifier(outputs)
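+ # the graph-level prediction is read from the graph token at position 0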
+ logits = head_outputs[:, 0, :].contiguous()
+
+ loss = None
+ if labels is not None:
+ mask = ~torch.isnan(labels)
+
+ if self.num_classes == 1: # regression
+ loss_fct = MSELoss()
+ loss = loss_fct(logits[mask].squeeze(), labels[mask].squeeze().float())
+ elif self.num_classes > 1 and len(labels.shape) == 1: # One task classification
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits[mask].view(-1, self.num_classes), labels[mask].view(-1))
+ else: # Binary multi-task classification
+ loss_fct = BCEWithLogitsLoss(reduction="sum")
+ loss = loss_fct(logits[mask], labels[mask])
+
+ if not return_dict:
+ return tuple(x for x in [loss, logits, hidden_states] if x is not None)
+ return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden_states, attentions=None)
+
+
+__all__ = ["GraphormerForGraphClassification", "GraphormerModel", "GraphormerPreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..826bdbddc1f182de43a795ab0d78ad4009507c14
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_jukebox import *
+ from .modeling_jukebox import *
+ from .tokenization_jukebox import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec09f2cd70b469fa414844355067310f08a2e17a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66cb53ac96327fe90d150799277f8eb178e391c5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/configuration_jukebox.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2cecb57daad9edd4a54af1726af247219f0c09d5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/__pycache__/tokenization_jukebox.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bb2e3da54aa232bf934a7922f731796a7146937
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py
@@ -0,0 +1,613 @@
+# coding=utf-8
+# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Jukebox configuration"""
+
+import os
+from typing import Union
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+_LARGE_ATTENTION = [
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "cross_attention",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "cross_attention",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "cross_attention",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "cross_attention",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "cross_attention",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "cross_attention",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "block_attn",
+ "transpose_block_attn",
+ "prev_block_attn",
+ "cross_attention",
+]
+_RawColumnPreviousRowAttention = ["block_attn", "transpose_block_attn", "prev_block_attn"]
+_FullDenseAttention = ["dense_attention"]
+_PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]
+
+
+def full_dense_attention(layer):
+ return _FullDenseAttention[0]
+
+
+def raw_column_previous_row_attention(layer):
+ return _RawColumnPreviousRowAttention[layer % 3]
+
+
+def large_separated_enc_dec_w_lyrics(layer):
+ return _LARGE_ATTENTION[layer % 79]
+
+
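+# every 16th layer (15, 31, 47, ...) uses the prime/dense pattern; all other layers cycle
+# through block, transpose-block and previous-block attention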
+def enc_dec_with_lyrics(layer):
+ if layer % 16 == 15:
+ return _PrimePrimeDenseAttention[layer % 3]
+ return _RawColumnPreviousRowAttention[layer % 3]
+
+
+ATTENTION_PATTERNS = {
+ "full_dense_attention": full_dense_attention,
+ "raw_column_previous_row_attention": raw_column_previous_row_attention, # Alternate row, column and previous row attn
+ "large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics, # Used by large separated_enc_dec model with lyrics
+ "enc_dec_with_lyrics": enc_dec_with_lyrics, # Used by encoder_decoder model with lyrics
+}
+
+
+class JukeboxPriorConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
+ `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the top level prior from the
+ [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+
+ Args:
+ act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
+ Activation function.
+ alignment_head (`int`, *optional*, defaults to 2):
+ Head that is responsible for the alignment between lyrics and music. Only used to compute the lyric to audio
+ alignment.
+ alignment_layer (`int`, *optional*, defaults to 68):
+ Index of the layer that is responsible for the alignment between lyrics and music. Only used to compute the
+ lyric to audio alignment.
+ attention_multiplier (`float`, *optional*, defaults to 0.25):
+ Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
+ 0.25*width of the model will be used.
+ attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
+ Which attention pattern to use for the decoder.
+ attn_dropout (`int`, *optional*, defaults to 0):
+ Dropout probability for the post-attention layer dropout in the decoder.
+ attn_res_scale (`bool`, *optional*, defaults to `False`):
+ Whether or not to scale the residuals in the attention conditioner block.
+ blocks (`int`, *optional*, defaults to 64):
+ Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
+ blocks]` in the `JukeboxAttention` layer.
+ conv_res_scale (`int`, *optional*):
+ Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
+ conditioner, the default value is None and should not be modified.
+ num_layers (`int`, *optional*, defaults to 72):
+ Number of layers of the transformer architecture.
+ emb_dropout (`int`, *optional*, defaults to 0):
+ Embedding dropout used in the lyric decoder.
+ encoder_config (`JukeboxPriorConfig`, *optional*):
+ Configuration of the encoder which models the prior on the lyrics.
+ encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
+ Multiplication factor used in front of the lyric encoder loss.
+ hidden_size (`int`, *optional*, defaults to 2048):
+ Hidden dimension of the attention layers.
+ init_scale (`float`, *optional*, defaults to 0.2):
+ Initialization scales for the prior modules.
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+ Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
+ greater than 0, the `encoder` args should be specified for the lyric encoding.
+ level (`int`, *optional*, defaults to 0):
+ Level of the prior in the model hierarchy; `from_pretrained` uses it to select the matching `prior_{level}`
+ sub-configuration when loading from a [`JukeboxConfig`].
+ lyric_vocab_size (`int`, *optional*, defaults to 80):
+ Size of the vocabulary used to encode the lyric tokens.
+ mask (`bool`, *optional*, defaults to `False`):
+ Whether or not to mask the previous positions in the attention.
+ max_duration (`int`, *optional*, defaults to 600):
+ Maximum supported duration of the generated song in seconds.
+ max_nb_genres (`int`, *optional*, defaults to 1):
+ Maximum number of genres that can be used to condition the model.
+ merged_decoder (`bool`, *optional*, defaults to `True`):
+ Whether or not the decoder and the encoder inputs are merged. This is used for the separated
+ encoder-decoder architecture
+ metadata_conditioning (`bool`, *optional*, defaults to `True`):
+ Whether or not to condition on the artist and genre metadata.
+ metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
+ Number of genres and the number of artists that were used to train the embedding layers of the prior
+ models.
+ min_duration (`int`, *optional*, defaults to 0):
+ Minimum duration of the generated audio on which the model was trained.
+ mlp_multiplier (`float`, *optional*, defaults to 1.0):
+ Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
+ the model will be used.
+ music_vocab_size (`int`, *optional*, defaults to 2048):
+ Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
+ n_ctx (`int`, *optional*, defaults to 6144):
+ Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
+ generating music tokens.
+ n_heads (`int`, *optional*, defaults to 2):
+ Number of attention heads.
+ nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
+ Number of lyric tokens that are used when sampling a single window of length `n_ctx`
+ res_conv_depth (`int`, *optional*, defaults to 3):
+ Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
+ `JukeboxMusicTokenConditioner`.
+ res_conv_width (`int`, *optional*, defaults to 128):
+ Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
+ `JukeboxMusicTokenConditioner`.
+ res_convolution_multiplier (`int`, *optional*, defaults to 1):
+ Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
+ res_dilation_cycle (`int`, *optional*):
+ Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
+ corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
+ tokens.
+ res_dilation_growth_rate (`int`, *optional*, defaults to 1):
+ Dilation growth rate used between each convolutional block of the `JukeboxMusicTokenConditioner`.
+ res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
+ Downsampling rates used in the audio conditioning network
+ res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
+ Striding used in the audio conditioning network
+ resid_dropout (`int`, *optional*, defaults to 0):
+ Residual dropout used in the attention pattern.
+ sampling_rate (`int`, *optional*, defaults to 44100):
+ Sampling rate used for training.
+ spread (`int`, *optional*):
+ Spread used in the `summary_spread_attention` pattern
+ timing_dims (`int`, *optional*, defaults to 64):
+ Dimension of the timing embedding.
+ zero_out (`bool`, *optional*, defaults to `False`):
+ Whether or not to zero out convolution weights when initializing.
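+
+ Example (illustrative, mirroring the [`JukeboxConfig`] example below):
+
+ ```python
+ >>> from transformers import JukeboxPriorConfig
+
+ >>> # Initializing a JukeboxPrior configuration with default values
+ >>> configuration = JukeboxPriorConfig()
+ ```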
+ """
+
+ model_type = "jukebox_prior"
+ attribute_map = {
+ "max_position_embeddings": "n_positions",
+ "num_attention_heads": "n_head",
+ }
+
+ def __init__(
+ self,
+ act_fn="quick_gelu",
+ level=0,
+ alignment_head=2,
+ alignment_layer=68,
+ attention_multiplier=0.25,
+ attention_pattern="enc_dec_with_lyrics",
+ attn_dropout=0,
+ attn_res_scale=False,
+ blocks=64,
+ conv_res_scale=None,
+ num_layers=72,
+ emb_dropout=0,
+ encoder_config=None,
+ encoder_loss_fraction=0.4,
+ hidden_size=2048,
+ init_scale=0.2,
+ is_encoder_decoder=True,
+ lyric_vocab_size=80,
+ mask=False,
+ max_duration=600,
+ max_nb_genres=1,
+ merged_decoder=True,
+ metadata_conditioning=True,
+ metadata_dims=[604, 7898],
+ min_duration=0,
+ mlp_multiplier=1.0,
+ music_vocab_size=2048,
+ n_ctx=6144,
+ n_heads=2,
+ nb_relevant_lyric_tokens=384,
+ res_conv_depth=3,
+ res_conv_width=128,
+ res_convolution_multiplier=1,
+ res_dilation_cycle=None,
+ res_dilation_growth_rate=1,
+ res_downs_t=[3, 2, 2],
+ res_strides_t=[2, 2, 2],
+ resid_dropout=0,
+ sampling_rate=44100,
+ spread=None,
+ timing_dims=64,
+ zero_out=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.act_fn = act_fn
+ self.alignment_head = alignment_head
+ self.alignment_layer = alignment_layer
+ self.attention_multiplier = attention_multiplier
+ self.attention_pattern = attention_pattern
+ self.attn_dropout = attn_dropout
+ self.attn_res_scale = attn_res_scale
+ self.blocks = blocks
+ self.conv_res_scale = conv_res_scale
+ self.num_layers = num_layers
+ self.emb_dropout = emb_dropout
+ self.music_vocab_size = music_vocab_size
+ if encoder_config is not None:
+ self.encoder_config = JukeboxPriorConfig(**encoder_config)
+ else:
+ self.encoder_config = None
+ self.encoder_loss_fraction = encoder_loss_fraction
+ self.init_scale = init_scale
+ self.is_encoder_decoder = is_encoder_decoder
+ self.lyric_vocab_size = lyric_vocab_size
+ self.level = level
+ self.mask = mask
+ self.max_duration = max_duration
+ self.max_nb_genres = max_nb_genres
+ self.merged_decoder = merged_decoder
+ self.metadata_conditioning = metadata_conditioning
+ self.metadata_dims = metadata_dims
+ self.min_duration = min_duration
+ self.mlp_multiplier = mlp_multiplier
+ self.n_ctx = n_ctx
+ self.n_heads = n_heads
+ self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
+ self.res_conv_depth = res_conv_depth
+ self.res_conv_width = res_conv_width
+ self.res_convolution_multiplier = res_convolution_multiplier
+ self.res_dilation_cycle = res_dilation_cycle
+ self.res_dilation_growth_rate = res_dilation_growth_rate
+ self.res_downs_t = res_downs_t
+ self.res_strides_t = res_strides_t
+ self.resid_dropout = resid_dropout
+ self.sampling_rate = sampling_rate
+ self.spread = spread
+ self.timing_dims = timing_dims
+ self.hidden_size = hidden_size
+ self.zero_out = zero_out
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], level=0, **kwargs):
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the prior config dict if we are loading from JukeboxConfig
+ if config_dict.get("model_type") == "jukebox":
+ config_dict = config_dict[f"prior_{level}"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class JukeboxVQVAEConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
+ `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the VQVAE from
+ [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ act_fn (`str`, *optional*, defaults to `"relu"`):
+ Activation function of the model.
+ nb_discrete_codes (`int`, *optional*, defaults to 2048):
+ Number of codes of the VQVAE.
+ commit (`float`, *optional*, defaults to 0.02):
+ Commit loss multiplier.
+ conv_input_shape (`int`, *optional*, defaults to 1):
+ Number of audio channels.
+ conv_res_scale (`bool`, *optional*, defaults to `False`):
+ Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
+ embed_dim (`int`, *optional*, defaults to 64):
+ Embedding dimension of the codebook vectors.
+ hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
+ Fraction of non-intersecting window used when continuing the sampling process.
+ levels (`int`, *optional*, defaults to 3):
+ Number of hierarchical levels used in the VQVAE.
+ lmu (`float`, *optional*, defaults to 0.99):
+ Exponential moving average coefficient used in the codebook update. For more details, refer to Appendix A.1
+ of the original [VQVAE paper](https://huggingface.co/papers/1711.00937v2.pdf)
+ multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
+ Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
+ res_conv_depth (`int`, *optional*, defaults to 4):
+ Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
+ res_conv_width (`int`, *optional*, defaults to 32):
+ Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
+ res_convolution_multiplier (`int`, *optional*, defaults to 1):
+ Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
+ res_dilation_cycle (`int`, *optional*):
+ Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1D block will have a depth
+ reduced by a power of `res_dilation_cycle`.
+ res_dilation_growth_rate (`int`, *optional*, defaults to 3):
+ Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
+ res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
+ Downsampling rate for each level of the hierarchical VQ-VAE.
+ res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
+ Stride used for each level of the hierarchical VQ-VAE.
+ sample_length (`int`, *optional*, defaults to 1058304):
+ Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
+ init_scale (`float`, *optional*, defaults to 0.2):
+ Initialization scale.
+ zero_out (`bool`, *optional*, defaults to `False`):
+ Whether or not to zero out convolution weights when initializing.
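+
+ Example (illustrative, mirroring the [`JukeboxConfig`] example below):
+
+ ```python
+ >>> from transformers import JukeboxVQVAEConfig
+
+ >>> # Initializing a JukeboxVQVAE configuration with default values
+ >>> configuration = JukeboxVQVAEConfig()
+ ```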
+ """
+
+ model_type = "jukebox_vqvae"
+
+ def __init__(
+ self,
+ act_fn="relu",
+ nb_discrete_codes=2048,
+ commit=0.02,
+ conv_input_shape=1,
+ conv_res_scale=False,
+ embed_dim=64,
+ hop_fraction=[0.125, 0.5, 0.5],
+ levels=3,
+ lmu=0.99,
+ multipliers=[2, 1, 1],
+ res_conv_depth=4,
+ res_conv_width=32,
+ res_convolution_multiplier=1,
+ res_dilation_cycle=None,
+ res_dilation_growth_rate=3,
+ res_downs_t=[3, 2, 2],
+ res_strides_t=[2, 2, 2],
+ sample_length=1058304,
+ init_scale=0.2,
+ zero_out=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.hop_fraction = hop_fraction
+ self.conv_input_shape = conv_input_shape
+ self.sample_length = sample_length
+
+ # VQVAE parameters (all used)
+ self.levels = levels
+ self.embed_dim = embed_dim
+ self.nb_discrete_codes = nb_discrete_codes
+ self.res_conv_width = res_conv_width
+ self.res_conv_depth = res_conv_depth
+ self.res_convolution_multiplier = res_convolution_multiplier
+ self.res_dilation_growth_rate = res_dilation_growth_rate
+ self.res_dilation_cycle = res_dilation_cycle
+ self.multipliers = multipliers
+ self.res_downs_t = res_downs_t
+ self.res_strides_t = res_strides_t
+ self.lmu = lmu
+ self.commit = commit
+ self.conv_res_scale = conv_res_scale
+ self.act_fn = act_fn
+ self.init_scale = init_scale
+ self.zero_out = zero_out
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs):
+ cls._set_token_in_kwargs(kwargs)
+
+ config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+ # get the vqvae config dict if we are loading from JukeboxConfig
+ if config_dict.get("model_type") == "jukebox":
+ config_dict = config_dict["vqvae_config"]
+
+ if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+ logger.warning(
+ f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+ f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+ )
+
+ return cls.from_dict(config_dict, **kwargs)
+
+
+class JukeboxConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`JukeboxModel`].
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
+ yield a similar configuration to that of
+ [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
+
+
+ The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
+ (5, 3) and strides = (2, 2) will downsample the audio by 2**5 = 32 to get the first level of codes, and by 2**8 =
+ 256 to get the second level of codes. This is mostly true for training the top level prior and the upsamplers.
+
+ Args:
+ vqvae_config (`JukeboxVQVAEConfig`, *optional*):
+ Configuration for the `JukeboxVQVAE` model.
+ prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
+ List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
+ nb_priors (`int`, *optional*, defaults to 3):
+ Number of prior models that will sequentially sample tokens. Each prior is a conditional autoregressive
+ (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
+ trained using a top prior and 2 upsampler priors.
+ sampling_rate (`int`, *optional*, defaults to 44100):
+ Sampling rate of the raw audio.
+ timing_dims (`int`, *optional*, defaults to 64):
+ Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
+ layer. The timing embedding layer converts the absolute and relative position in the currently sampled
+ audio to a tensor of length `timing_dims` that will be added to the music tokens.
+ min_duration (`int`, *optional*, defaults to 0):
+ Minimum duration of the audio to generate.
+ max_duration (`float`, *optional*, defaults to 600.0):
+ Maximum duration of the audio to generate.
+ max_nb_genres (`int`, *optional*, defaults to 5):
+ Maximum number of genres that can be used to condition a single sample.
+ metadata_conditioning (`bool`, *optional*, defaults to `True`):
+ Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
+ duration.
+
+ Example:
+
+ ```python
+ >>> from transformers import JukeboxModel, JukeboxConfig
+
+ >>> # Initializing a Jukebox configuration
+ >>> configuration = JukeboxConfig()
+
+ >>> # Initializing a model from the configuration
+ >>> model = JukeboxModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```
+ """
+
+ model_type = "jukebox"
+
+ def __init__(
+ self,
+ vqvae_config=None,
+ prior_config_list=None,
+ nb_priors=3,
+ sampling_rate=44100,
+ timing_dims=64,
+ min_duration=0,
+ max_duration=600.0,
+ max_nb_genres=5,
+ metadata_conditioning=True,
+ **kwargs,
+ ):
+ if vqvae_config is None:
+ vqvae_config = {}
+ logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.")
+
+ self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config)
+ if prior_config_list is not None:
+ self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list]
+ else:
+ self.prior_configs = []
+ for prior_idx in range(nb_priors):
+ prior_config = kwargs.pop(f"prior_{prior_idx}", None)
+ if prior_config is None:
+ prior_config = {}
+ logger.info(
+ f"prior_{prior_idx}'s config is None. Initializing the JukeboxPriorConfig list with default"
+ " values."
+ )
+ self.prior_configs.append(JukeboxPriorConfig(**prior_config))
+
+ self.hop_fraction = self.vqvae_config.hop_fraction
+
+ self.nb_priors = nb_priors
+
+ # Metadata conditioning
+ self.max_nb_genres = max_nb_genres
+ self.sampling_rate = sampling_rate
+ self.timing_dims = timing_dims
+ self.min_duration = min_duration
+ self.max_duration = max_duration
+ self.metadata_conditioning = metadata_conditioning
+
+ super().__init__(**kwargs)
+
+ @classmethod
+ def from_configs(cls, prior_configs: list[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs):
+ r"""
+ Instantiate a [`JukeboxConfig`] (or a derived class) from a list of prior model configurations and a VQVAE
+ model configuration.
+
+ Returns:
+ [`JukeboxConfig`]: An instance of a configuration object
+ """
+ prior_config_list = [config.to_dict() for config in prior_configs]
+ return cls(prior_config_list=prior_config_list, vqvae_config=vqvae_config.to_dict(), **kwargs)
+
+ def to_dict(self):
+ # Override the default to_dict to apply to_dict to the list of prior configs.
+ result = super().to_dict()
+ result["prior_config_list"] = [config.to_dict() for config in result.pop("prior_configs")]
+ return result
+
+
+__all__ = ["JukeboxConfig", "JukeboxPriorConfig", "JukeboxVQVAEConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py
new file mode 100644
index 0000000000000000000000000000000000000000..f928d49cf5f73156c1352e1a5cb8695a57a08c1c
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py
@@ -0,0 +1,2670 @@
+# coding=utf-8
+# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Jukebox model."""
+
+import math
+import os
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import LayerNorm as FusedLayerNorm
+
+from ....activations import ACT2FN
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging
+from ....utils.logging import tqdm
+from .configuration_jukebox import ATTENTION_PATTERNS, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
+ """
+ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+
+ Args:
+ logits (`torch.Tensor`):
+ logits distribution shape (vocabulary size)
+ top_k (`int`, *optional*, defaults to 0):
+ When `top_k > 0`, keep only the top k tokens with highest probability (top-k filtering).
+ top_p (`float`, *optional*, defaults to 0.0):
+ When `top_p > 0.0`, keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
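+
+ Example (illustrative):
+
+ ```python
+ >>> logits = torch.tensor([[1.0, 2.0, 3.0, 4.0]])
+ >>> filtered = filter_logits(logits, top_k=2) # all but the two largest logits become -inf
+ ```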
+ """
+ logits = logits.clone()
+ top_k = min(top_k, logits.size(-1)) # Safety check
+
+ if top_k > 0:
+ # Remove all tokens with a probability less than the last token of the top-k
+ indices_to_remove = logits < torch.topk(logits, top_k, dim=-1)[0][..., -1:]
+ logits[indices_to_remove] = filter_value
+
+ if top_p > 0.0:
+ sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+ cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+ # Remove tokens with cumulative probability above the threshold
+ sorted_indices_to_remove = cumulative_probs > top_p
+ # Shift the indices to the right to keep also the first token above the threshold
+ sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+ sorted_indices_to_remove[..., 0] = 0
+
+ # indices_to_remove = sorted_indices[sorted_indices_to_remove]
+ indices_to_remove = torch.zeros_like(logits, dtype=torch.bool).scatter_(
+ dim=-1, index=sorted_indices, src=sorted_indices_to_remove
+ )
+ logits[indices_to_remove] = filter_value
+ return logits
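+
+# Worked example for `filter_logits` (illustrative values, not from the original source): with
+# top_k=2, every logit strictly below the second-largest value is replaced by `filter_value`,
+# so only the two most likely tokens remain sampleable:
+#
+#     logits = torch.tensor([[1.0, 3.0, 2.0, 0.5]])
+#     filter_logits(logits, top_k=2)  # -> tensor([[-inf, 3.0, 2.0, -inf]])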
+
+
+def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
+ """
+ Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
+ returned. If the provided token sequence is smaller, it will be padded, otherwise, only characters ranging from the
+ midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
+ the most relevant tokens (in time) for the sequence.
+
+ Args:
+ full_tokens (`list[int]`):
+ List containing the token ids of the entire lyrics.
+ max_n_lyric_tokens (`int`):
+ Maximum number of lyric tokens to return.
+ total_length (`int`):
+ Total expected length of the music (not all of it is generated, see duration), in samples.
+ offset (`int`):
+ Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted to take that
+ into account.
+ duration (`int`):
+ Expected duration of the generated music, in samples. The duration has to be smaller than the total
+ length, which represents the overall length of the signal.
+ """
+ full_tokens = full_tokens[0]
+ if len(full_tokens) < max_n_lyric_tokens:
+ tokens = torch.cat(
+ [torch.zeros(max_n_lyric_tokens - len(full_tokens), dtype=torch.long).to(full_tokens.device), full_tokens]
+ )
+ indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
+ else:
+ midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
+ midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2)
+ tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
+ indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
+ return tokens.unsqueeze(dim=0), indices
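+
+# Worked example (illustrative values): with 10 lyric tokens, max_n_lyric_tokens=4,
+# total_length=100, offset=0 and duration=50, the midpoint is int(10 * 25.0 / 100) = 2, already
+# inside the clamp range [2, 8], so the window full_tokens[0:4] is returned together with the
+# indices [0, 1, 2, 3].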
+
+
+# Break total_length into hops/windows of size n_ctx separated by hop_length
+def get_starts(total_length, n_ctx, hop_length):
+ starts = []
+ for start in range(0, total_length - n_ctx + hop_length, hop_length):
+ if start + n_ctx >= total_length:
+ # Last hop could be smaller, we make it n_ctx to maximise context
+ start = total_length - n_ctx
+ starts.append(start)
+ return starts
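+
+# Worked example (illustrative values): get_starts(total_length=10, n_ctx=4, hop_length=4)
+# yields candidate starts 0, 4, 8; the last window would overrun the signal, so its start is
+# pulled back to total_length - n_ctx = 6, giving [0, 4, 6].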
+
+
+def get_alignment(music_tokens, labels, prior, config):
+ level = prior.levels - 1 # Top level used
+ n_ctx = prior.n_ctx
+ tokens = music_tokens[level]
+ batch_size, total_length = tokens.shape[0], tokens.shape[1]
+ if total_length < n_ctx:
+ padding_length = n_ctx - total_length
+ tokens = torch.cat(
+ [tokens, torch.zeros(batch_size, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1
+ )
+ total_length = tokens.shape[1]
+ else:
+ padding_length = 0
+
+ hop_length = int(config.hop_fraction[-level - 1] * prior.n_ctx)
+ alignment_head, alignment_layer = config.prior_alignment_head[0], config.prior_alignment_layer[0]
+ attn_layers = {alignment_layer}
+ alignment_hops = {}
+ indices_hops = {}
+ for start in tqdm(get_starts(total_length, n_ctx, hop_length), desc="Computing lyric to music alignment "):
+ end = start + n_ctx
+ # set metadata offset, sample_length and lyrics tokens
+ metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
+ tokens_bs = torch.chunk(tokens, batch_size, dim=0)
+ metadata_bs = torch.chunk(metadata, batch_size, dim=0)
+ w_hops = []
+ for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
+ w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers)
+ w_hops.append(w_hop[0][:, alignment_head])
+ del w_hop
+ weights = torch.cat(w_hops, dim=0)
+ del w_hops
+ alignment_hop = weights.to(device="cpu", dtype=torch.float).numpy()
+ del weights
+
+ # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens)
+ # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens
+ indices_hops[start] = indices_hop
+ alignment_hops[start] = alignment_hop
+
+ # Combine attn for each hop into attn for full range
+ # Use indices to place them into correct place for corresponding source tokens
+ alignments = []
+ for item in range(batch_size):
+ # Note each item has different length lyrics
+ full_tokens = labels[0, 3:]
+ alignment = np.zeros((total_length, len(full_tokens) + 1))
+ for start in reversed(get_starts(total_length, n_ctx, hop_length)):
+ end = start + n_ctx
+ alignment_hop = alignment_hops[start][item]
+ indices = indices_hops[start][item]
+ alignment[start:end, indices] = alignment_hop
+ alignment = alignment[: total_length - padding_length, :-1] # remove token padding, and last lyric index
+ alignments.append(alignment)
+ return alignments
+
+
+def save_temp_audio(fname, lvl, metas, aud):
+ aud = torch.clamp(aud, -1, 1).cpu().numpy()
+ for i in list(range(aud.shape[0])):
+ if metas is not None:
+ artists, genres, lyrics = list(metas)[i].values()
+ path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}"
+ np.save(path, aud[i])
+ else:
+ np.save(f"{fname}/lvl_{lvl}-sample-{i}", aud[i])
+
+
+def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
+ # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed.
+ if mask is None or query_length == 1:
+ return None
+ offset = sample_t - query_length if sample else max(key_value_length - query_length, 0)
+ if mask == "autoregressive":
+ # Masked dense
+ mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
+ elif mask == "summary":
+ # Masked summary
+ mask = torch.ones(query_length, query_length, device=device).tril()
+ mask = mask.view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :]
+ mask = (
+ torch.nn.functional.pad(
+ mask,
+ (0, 0, 1, 0),
+ value=1,
+ )
+ .contiguous()
+ .view(query_length, key_value_length)
+ )
+ elif mask == "prime":
+ mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
+ return mask.view(1, 1, query_length, key_value_length)
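+
+# Worked example (illustrative values): an "autoregressive" mask with
+# query_length == key_value_length == 3 and sample=False has offset 0 and is the lower
+# triangular matrix [[1, 0, 0], [1, 1, 0], [1, 1, 1]], returned with shape (1, 1, 3, 3).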
+
+
+class JukeboxConv1D(nn.Module):
+ def __init__(self, input_width, output_width):
+ super().__init__()
+ self.input_width = input_width
+ self.output_width = output_width
+ weight = torch.empty(input_width, output_width)
+ bias = torch.zeros(output_width)
+ self.weight = nn.Parameter(weight)
+ self.bias = nn.Parameter(bias)
+
+ def forward(self, hidden_states):
+ size_out = (*hidden_states.size()[:-1], self.output_width)
+ hidden_states = torch.addmm(
+ self.bias.type_as(hidden_states),
+ hidden_states.view(-1, hidden_states.size(-1)),
+ self.weight.type_as(hidden_states),
+ )
+ hidden_states = hidden_states.view(*size_out)
+ return hidden_states
+
+
+class JukeboxResConv1DBlock(nn.Module):
+ def __init__(self, config, conv_width, depth=1, res_scale=1.0):
+ super().__init__()
+ hidden_dim = config.res_convolution_multiplier * conv_width
+ dilation = config.res_dilation_growth_rate**depth
+ padding = dilation
+
+ self.res_scale = res_scale
+ self.activation = nn.ReLU()
+ self.conv1d_1 = nn.Conv1d(conv_width, hidden_dim, 3, 1, padding, dilation)
+ self.conv1d_2 = nn.Conv1d(hidden_dim, conv_width, 1, 1, 0)
+
+ def forward(self, hidden_states):
+ residuals = hidden_states
+ hidden_states = self.activation(hidden_states)
+ hidden_states = self.conv1d_1(hidden_states)
+ hidden_states = self.activation(hidden_states)
+ hidden_states = self.conv1d_2(hidden_states)
+ return residuals + self.res_scale * hidden_states
+
+
+class JukeboxResnet1D(nn.Module):
+ def __init__(self, config, conv_width, n_depth, reverse_dilation=False):
+ super().__init__()
+ self.dilation_cycle = config.res_dilation_cycle
+ res_scale = 1.0 if not config.conv_res_scale else 1.0 / math.sqrt(n_depth)
+
+ blocks = []
+ for depth in range(n_depth):
+ block_depth = depth if self.dilation_cycle is None else depth % self.dilation_cycle
+ blocks.append(JukeboxResConv1DBlock(config, conv_width, block_depth, res_scale))
+
+ if reverse_dilation:
+ blocks = blocks[::-1]
+ self.resnet_block = nn.ModuleList(blocks)
+
+ def forward(self, hidden_states):
+ for block in self.resnet_block:
+ hidden_states = block(hidden_states)
+ return hidden_states
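+
+# Dilation schedule sketch (illustrative values): with res_dilation_growth_rate=3 and no
+# dilation cycle, the blocks at depth 0..3 use dilations 1, 3, 9, 27; with res_dilation_cycle=2
+# the depth wraps, giving dilations 1, 3, 1, 3.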
+
+
+class JukeboxEncoderConvBlock(nn.Module):
+ def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t):
+ super().__init__()
+ blocks = []
+ filter_t = stride_t * 2
+ pad_t = stride_t // 2
+ if down_t > 0:
+ for i in range(down_t):
+ blocks.append(nn.Conv1d(embed_dim if i == 0 else hidden_dim, hidden_dim, filter_t, stride_t, pad_t))
+ blocks.append(JukeboxResnet1D(config, hidden_dim, depth))
+ self.proj_out = nn.Conv1d(hidden_dim, config.embed_dim, 3, 1, 1)
+ self.downsample_block = nn.ModuleList(blocks)
+
+ def forward(self, hidden_states):
+ for block in self.downsample_block:
+ hidden_states = block(hidden_states)
+ hidden_states = self.proj_out(hidden_states)
+ return hidden_states
+
+
+class JukeboxEncoder(nn.Module):
+ def __init__(self, config, width, depth, levels, downs_t, strides_t):
+ super().__init__()
+ self.levels = levels
+ self.level_blocks = nn.ModuleList()
+
+ iterator = zip(list(range(self.levels)), downs_t, strides_t)
+ for i, down_t, stride_t in iterator:
+ self.level_blocks.append(
+ JukeboxEncoderConvBlock(
+ config, config.conv_input_shape if i == 0 else config.embed_dim, width, depth, down_t, stride_t
+ )
+ )
+
+ def forward(self, hidden_states):
+ all_hidden_states = []
+
+ # 64, 32, ...
+ for level in range(self.levels):
+ level_block = self.level_blocks[level]
+ hidden_states = level_block(hidden_states)
+ all_hidden_states.append(hidden_states)
+
+ return all_hidden_states
+
+
+class JukeboxDecoderConvBock(nn.Module):
+ def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t, reverse_dilation=True):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.hidden_dim = hidden_dim
+ blocks = []
+ if down_t > 0:
+ filter_t = stride_t * 2
+ pad_t = stride_t // 2
+ self.proj_in = nn.Conv1d(embed_dim, hidden_dim, 3, 1, 1)
+ for i in range(down_t):
+ blocks.append(JukeboxResnet1D(config, hidden_dim, depth, reverse_dilation))
+ blocks.append(
+ nn.ConvTranspose1d(
+ hidden_dim, hidden_dim if i < down_t - 1 else embed_dim, filter_t, stride_t, pad_t
+ )
+ )
+ self.upsample_block = nn.ModuleList(blocks)
+
+ def forward(self, hidden_states):
+ hidden_states = self.proj_in(hidden_states)
+ for block in self.upsample_block:
+ hidden_states = block(hidden_states)
+ return hidden_states
+
+
+class JukeboxDecoder(nn.Module):
+ def __init__(self, config, hidden_dim, depth, levels, downs_t, strides_t):
+ super().__init__()
+ self.levels = levels
+ self.level_blocks = nn.ModuleList()
+ for level, down_t, stride_t in zip(list(range(self.levels)), downs_t, strides_t):
+ self.level_blocks.append(
+ JukeboxDecoderConvBock(config, config.embed_dim, hidden_dim, depth, down_t, stride_t)
+ )
+
+ self.out = nn.Conv1d(config.embed_dim, config.conv_input_shape, 3, 1, 1)
+
+ def forward(self, hidden_states, all_levels=True):
+ hidden_state = hidden_states[-1]
+
+ # 32, 64 ...
+ for level in reversed(range(self.levels)):
+ level_block = self.level_blocks[level]
+ hidden_state = level_block(hidden_state)
+
+ if level != 0 and all_levels:
+ hidden_state = hidden_state + hidden_states[level - 1]
+
+ hidden_state = self.out(hidden_state)
+ return hidden_state
+
+
+class JukeboxBottleneckBlock(nn.Module):
+ def __init__(self, config: JukeboxVQVAEConfig):
+ super().__init__()
+ self.nb_discrete_codes = config.nb_discrete_codes
+ self.codebook_width = config.embed_dim
+ self.mu = config.lmu
+ self.threshold = 1.0
+ self.init = False
+ self.codebook_sum = None
+ self.codebook_elem = None
+ self.register_buffer("codebook", torch.zeros(self.nb_discrete_codes, self.codebook_width))
+
+ def _tile(self, hidden_states):
+ dim, embed_width = hidden_states.shape
+ if dim < self.nb_discrete_codes:
+ n_repeats = (self.nb_discrete_codes + dim - 1) // dim
+ std = 0.01 / np.sqrt(embed_width)
+ hidden_states = hidden_states.repeat(n_repeats, 1)
+ hidden_states = hidden_states + torch.randn_like(hidden_states) * std
+ return hidden_states
+
+ def init_codebook(self, hidden_states):
+ nb_discrete_codes = self.nb_discrete_codes
+ self.init = True
+ codes = self._tile(hidden_states)
+ self.codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
+ self.codebook_sum = self.codebook
+ self.codebook_elem = torch.ones(nb_discrete_codes, device=self.codebook.device)
+
+ def update_codebook(self, hidden_states, latent_states):
+ mu, codebook_width, nb_discrete_codes = self.mu, self.codebook_width, self.nb_discrete_codes
+ with torch.no_grad():
+ # Calculate new centres
+ # nb_discrete_codes, batch_size * seq_length
+ latent_states_onehot = torch.zeros(nb_discrete_codes, hidden_states.shape[0], device=hidden_states.device)
+ latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1)
+
+ _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)
+ _codebook_elem = latent_states_onehot.sum(dim=-1) # nb_discrete_codes
+ codes = self._tile(hidden_states)
+ _random_codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
+
+ # Update centres
+ old_codebook = self.codebook
+ self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum
+ self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem # nb_discrete_codes
+ usage = (self.codebook_elem.view(nb_discrete_codes, 1) >= self.threshold).float()
+
+ norm_code = self.codebook_sum.view(nb_discrete_codes, codebook_width) / self.codebook_elem.view(
+ nb_discrete_codes, 1
+ )
+ self.codebook = usage * (norm_code) + (1 - usage) * _random_codebook
+ _codebook_prob = _codebook_elem / torch.sum(_codebook_elem) # prob of each bin
+ entropy = -torch.sum(_codebook_prob * torch.log(_codebook_prob + 1e-8)) # entropy ie how diverse
+ used_curr = (_codebook_elem >= self.threshold).sum()
+ usage = torch.sum(usage)
+ dk = torch.linalg.norm(self.codebook - old_codebook) / np.sqrt(np.prod(old_codebook.shape))
+ return {"entropy": entropy, "used_curr": used_curr, "usage": usage, "dk": dk}
+
+ def preprocess(self, hidden_states):
+ hidden_states = hidden_states.permute(0, 2, 1).contiguous()
+ hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+
+ if hidden_states.shape[-1] == self.codebook_width:
+ prenorm = torch.linalg.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(
+ np.prod(hidden_states.shape)
+ )
+ elif hidden_states.shape[-1] == 2 * self.codebook_width:
+ x1, x2 = hidden_states[..., : self.codebook_width], hidden_states[..., self.codebook_width :]
+ prenorm = (torch.linalg.norm(x1 - torch.mean(x1)) / np.sqrt(np.prod(x1.shape))) + (
+ torch.linalg.norm(x2 - torch.mean(x2)) / np.sqrt(np.prod(x2.shape))
+ )
+
+ # Normalise
+ hidden_states = x1 + x2
+
+ return hidden_states, prenorm
+
+ def postprocess(self, latent_states, dequantised_states, x_shape):
+ batch_size, time = x_shape
+ dequantised_states = dequantised_states.view(batch_size, time, -1).permute(0, 2, 1).contiguous()
+ latent_states = latent_states.view(batch_size, time)
+ return latent_states, dequantised_states
+
+ def quantise(self, latent_states):
+ # Calculate latent code latent_states
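+ # This expands the squared euclidean distance ||z - e||^2 = z.z - 2 z.e + e.e for every
+ # (latent vector, codebook entry) pair, so `torch.min` below picks the nearest code.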
+ codebook_weights = self.codebook.t()
+ distance = (
+ torch.sum(latent_states**2, dim=-1, keepdim=True)
+ - 2 * torch.matmul(latent_states, codebook_weights)
+ + torch.sum(codebook_weights**2, dim=0, keepdim=True)
+ ) # (batch_size * latent_states , codebook_weights)
+ min_distance, music_tokens = torch.min(distance, dim=-1)
+ fit = torch.mean(min_distance)
+ return music_tokens, fit
+
+ def dequantise(self, music_tokens):
+ dequantised_states = F.embedding(music_tokens, self.codebook)
+ return dequantised_states
+
+ def encode(self, latent_states):
+ samples, _, seq_len = latent_states.shape
+
+ # Preprocess.
+ latent_states, _ = self.preprocess(latent_states)
+
+ # Quantise
+ music_tokens, _ = self.quantise(latent_states)
+
+ # Postprocess.
+ music_tokens = music_tokens.view(samples, seq_len)
+ return music_tokens
+
+ def decode(self, music_tokens):
+ samples, seq_len = music_tokens.shape
+
+ # Dequantise
+ dequantised_states = self.dequantise(music_tokens)
+
+ # Postprocess
+ dequantised_states = (
+ dequantised_states.view(samples, seq_len, self.codebook_width).permute(0, 2, 1).contiguous()
+ )
+ return dequantised_states
+
+ def forward(self, hidden_states, update_codebook=True):
+ samples, _, seq_len = hidden_states.shape
+
+ # Preprocess
+ hidden_states, prenorm = self.preprocess(hidden_states)
+
+ # Init codebook if not inited
+ if update_codebook and not self.init:
+ self.init_codebook(hidden_states)
+
+ # Quantise and dequantise through bottleneck
+ music_tokens, fit = self.quantise(hidden_states)
+ dequantised_states = self.dequantise(music_tokens)
+
+ # Update embeddings
+ if update_codebook:
+ update_metrics = self.update_codebook(hidden_states, music_tokens)
+ else:
+ update_metrics = {}
+
+ # Loss
+ commit_loss = torch.linalg.norm(dequantised_states.detach() - hidden_states) ** 2 / np.prod(
+ hidden_states.shape
+ )
+
+ # Passthrough
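+ # (straight-through estimator: the forward value equals the quantised state, but the
+ # gradient w.r.t. `hidden_states` is the identity, so the encoder trains as if the
+ # quantisation step were transparent)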
+ dequantised_states = hidden_states + (dequantised_states - hidden_states).detach()
+
+ # Postprocess
+ music_tokens, dequantised_states = self.postprocess(music_tokens, dequantised_states, (samples, seq_len))
+ return music_tokens, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
+
+
+class JukeboxBottleneck(nn.Module):
+ def __init__(self, config, levels):
+ super().__init__()
+ self.levels = levels
+ self.level_blocks = nn.ModuleList()
+ for level in range(self.levels):
+ self.level_blocks.append(JukeboxBottleneckBlock(config))
+
+ def encode(self, raw_audio):
+ music_tokens = [
+ level_block.encode(hidden_states) for (level_block, hidden_states) in zip(self.level_blocks, raw_audio)
+ ]
+ return music_tokens
+
+ def decode(self, music_tokens, start_level=0, end_level=None):
+ if end_level is None:
+ end_level = self.levels
+ quantised_audio = [
+ level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], music_tokens)
+ ]
+ return quantised_audio
+
+ def forward(self, input_audio):
+ music_tokens, quantised_states, commit_losses, metrics = [], [], [], []
+ for level in range(self.levels):
+ level_block = self.level_blocks[-level - 1]
+ hidden_states = input_audio[level]
+ sampled_tokens, quantised_state, commit_loss, metric = level_block(
+ hidden_states, update_codebook=self.training
+ )
+ music_tokens.append(sampled_tokens)
+ if not self.training:
+ # Be extra paranoid and make sure the encoder weights can't
+ # change from straight-through estimator
+ quantised_state = quantised_state.detach()
+ quantised_states.append(quantised_state)
+ commit_losses.append(commit_loss)
+ if self.training:
+ metrics.append(metric)
+ return music_tokens, quantised_states, commit_losses, metrics
+
+
+JUKEBOX_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+ and behavior.
+
+ Parameters:
+ config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ """The Hierarchical VQ-VAE model used in Jukebox. This model follows the Hierarchical VQVAE paper from [Will Williams, Sam
+Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://huggingface.co/papers/2002.08111).
+
+ """,
+ JUKEBOX_START_DOCSTRING,
+)
+class JukeboxVQVAE(PreTrainedModel):
+ config: JukeboxVQVAEConfig
+ base_model_prefix = "vqvae"
+
+ def _init_weights(self, module):
+ if isinstance(module, nn.Embedding): # embed_tokens
+ module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale)
+ elif isinstance(module, JukeboxConv1D):
+ if self.config.zero_out:
+ module.weight.data.zero_()
+ else:
+ module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale)
+ elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
+ module.conv1d_2.weight.data.zero_()
+ module.conv1d_2.bias.data.zero_()
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def __init__(self, config: JukeboxVQVAEConfig):
+ super().__init__(config)
+ downs_t = config.res_downs_t
+ strides_t = config.res_strides_t
+ if not config.sample_length:
+ downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]
+ top_raw_to_tokens = np.prod(downsamples)
+ config.sample_length = (
+ config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens
+ ) * top_raw_to_tokens
+ config.sample_length = config.sample_length.astype(int)
+
+ self.nb_discrete_codes = config.nb_discrete_codes
+ self.commit = config.commit
+ self.sample_length = config.sample_length
+
+ self.downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]
+ self.hop_lengths = np.cumprod(self.downsamples)
+ self.levels = levels = config.levels
+ self.music_tokens_shapes = [
+ (int(self.sample_length // self.hop_lengths[-level - 1])) for level in range(levels)
+ ]
+
+ self.multipliers = config.multipliers if config.multipliers is not None else [1] * levels
+
+ self.encoders = nn.ModuleList()
+ self.decoders = nn.ModuleList()
+ for level in range(levels):
+ width = config.res_conv_width * self.multipliers[level]
+ depth = config.res_conv_depth * self.multipliers[level]
+ self.encoders.append(
+ JukeboxEncoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1])
+ )
+ self.decoders.append(
+ JukeboxDecoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1])
+ )
+
+ self.bottleneck = JukeboxBottleneck(config, levels)
+
+ def _decode(self, music_tokens, start_level=0, end_level=None):
+ # Decode
+ if end_level is None:
+ end_level = self.levels
+ latent_states = self.bottleneck.decode(music_tokens, start_level=start_level, end_level=end_level)
+ # Use only lowest level
+ decoder, dequantised_state = self.decoders[start_level], latent_states[0:1]
+ dequantised_state = decoder(dequantised_state, all_levels=False)
+ dequantised_state = dequantised_state.permute(0, 2, 1)
+ return dequantised_state
+
+ def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1) -> torch.Tensor:
+ """
+ Transforms the input `music_tokens` to their `raw_audio` representation.
+
+ Args:
+ music_tokens (`torch.LongTensor`):
+ Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token
+ should be an index to a corresponding `code` vector in the codebook.
+ start_level (`int`, *optional*, defaults to 0):
+ Level at which the decoding process will start.
+ end_level (`int`, *optional*):
+ Level at which the decoding process will stop. Defaults to `None`, which means all levels are used.
+ bs_chunks (`int`, *optional*, defaults to 1):
+ Number of chunks to process at the same time.
+ """
+ token_chunks = [torch.chunk(token, bs_chunks, dim=0) for token in music_tokens]
+ dequantised_states = []
+ for i in range(bs_chunks):
+ music_tokens_i = [chunks[i] for chunks in token_chunks]
+ dequantised_state = self._decode(music_tokens_i, start_level=start_level, end_level=end_level)
+ dequantised_states.append(dequantised_state)
+ return torch.cat(dequantised_states, dim=0)
+
+ def _encode(self, raw_audio, start_level=0, end_level=None):
+ # Encode
+ if end_level is None:
+ end_level = self.levels
+ input_audio = raw_audio.permute(0, 2, 1).float()
+ latent_states = []
+ for level in range(self.levels):
+ encoder = self.encoders[level]
+ latent_state = encoder(input_audio)
+ latent_states.append(latent_state[-1])
+ music_tokens = self.bottleneck.encode(latent_states)
+ return music_tokens[start_level:end_level]
+
+ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
+ """
+ Transforms the `input_audio` to a discrete representation made out of `music_tokens`.
+
+ Args:
+ input_audio (`torch.Tensor`):
+ Raw audio which will be encoded to its discrete representation using the codebook. The closest `code`
+ from the codebook will be computed for each sequence of samples.
+ start_level (`int`, *optional*, defaults to 0):
+ Level at which the encoding process will start.
+ end_level (`int`, *optional*):
+ Level at which the encoding process will stop. Defaults to `None`, which means all levels are used.
+ bs_chunks (`int`, *optional*, defaults to 1):
+ Number of chunks of raw audio to process at the same time.
+ """
+ audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0)
+ music_tokens_list = []
+ for chunk_i in audio_chunks:
+ music_tokens_i = self._encode(chunk_i, start_level=start_level, end_level=end_level)
+ music_tokens_list.append(music_tokens_i)
+ music_tokens = [torch.cat(music_tokens_level, dim=0) for music_tokens_level in zip(*music_tokens_list)]
+ return music_tokens
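+
+ # Round-trip sketch (illustrative, hypothetical shapes): `tokens = vqvae.encode(raw_audio)`
+ # returns one token tensor per level, and `vqvae.decode(tokens)` maps them back to a raw
+ # audio tensor of shape (batch, time, channels).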
+
+ def sample(self, n_samples):
+ music_tokens = [
+ torch.randint(0, self.nb_discrete_codes, size=(n_samples, *music_tokens_shape), device="cpu")
+ for music_tokens_shape in self.music_tokens_shapes
+ ]
+ return self.decode(music_tokens)
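+
+ # Sampling sketch: `sample` draws random codebook indices at every level and decodes them,
+ # which is mainly useful as a smoke test of the decoder stack (the output is not musical).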
+
+ def forward(self, raw_audio: torch.FloatTensor) -> tuple[torch.Tensor, torch.Tensor]:
+ """
+ Forward pass of the VQ-VAE, encodes the `raw_audio` to latent states, which are then decoded for each level.
+ The commit loss, which ensures that the encoder's computed embeddings are close to the codebook vectors, is
+ computed.
+
+ Args:
+ raw_audio (`torch.FloatTensor`):
+ Audio input which will be encoded and decoded.
+
+ Returns:
+ `tuple[torch.Tensor, torch.Tensor]`
+
+
+ Example:
+ ```python
+ >>> from transformers import JukeboxVQVAE, set_seed
+ >>> import torch
+
+ >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
+ >>> set_seed(0)
+ >>> zs = [torch.randint(100, (4, 1))]
+ >>> model.decode(zs).shape
+ torch.Size([4, 8, 1])
+ ```
+ """
+
+ # Encode/Decode
+ input_audio = raw_audio.permute(0, 2, 1).float()
+ latent_states = []
+ for level in range(self.levels):
+ encoder = self.encoders[level]
+ latent_state = encoder(input_audio)
+ latent_states.append(latent_state[-1])
+
+ _, quantised_states, commit_losses, _ = self.bottleneck(latent_states)
+ dequantised_states = []
+ for level in range(self.levels):
+ decoder = self.decoders[level]
+ dequantised_state = decoder(quantised_states[level : level + 1], all_levels=False)
+ dequantised_states.append(dequantised_state.permute(0, 2, 1))
+
+ commit_loss = sum(commit_losses)
+ loss = self.commit * commit_loss
+
+ return dequantised_states, loss
+
+
+class JukeboxMLP(nn.Module):
+ def __init__(self, config):
+ # a single channel is always used in original code
+ super().__init__()
+ embed_dim = config.hidden_size
+ hidden_dim = int(config.mlp_multiplier * embed_dim)
+
+ self.c_fc = JukeboxConv1D(embed_dim, hidden_dim)
+ self.c_proj = JukeboxConv1D(hidden_dim, embed_dim)
+ self.act = ACT2FN[config.act_fn]
+ self.dropout = nn.Dropout(config.resid_dropout)
+
+ def forward(self, hidden_states):
+ hidden_states = self.c_fc(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.c_proj(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states
+
+
+class JukeboxLayerNorm(FusedLayerNorm):
+ def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
+ super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
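+ # The numel guard below presumably works around a grid-size limit of the fused layer-norm
+ # kernel used in the original implementation (an assumption about intent): inputs larger
+ # than 65535 rows fall back to the plain `F.layer_norm` path in `forward`.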
+ self.width = np.prod(normalized_shape)
+ self.max_numel = 65535 * self.width
+
+ def forward(self, input):
+ if input.numel() > self.max_numel:
+ return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
+ else:
+ return super().forward(input).type_as(input)
+
+
+class JukeboxAttention(nn.Module):
+ def __init__(self, config, n_ctx, attn_func="dense_attn"):
+ super().__init__()
+ self.embed_dim = config.hidden_size
+ self.n_heads = config.n_heads
+ self.dropout = config.attn_dropout
+ hidden_dim = int(config.attention_multiplier * self.embed_dim)
+
+ self.head_dim = hidden_dim // config.n_heads
+ self.n_ctx = n_ctx
+ self.hidden_dim = hidden_dim
+ self.scale = self.head_dim**-0.25
+ self.mask = config.mask
+
+ if attn_func == "cross_attention":
+ self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim)
+ self.c_enc_kv = JukeboxConv1D(self.embed_dim, hidden_dim * 2)
+ else:
+ self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim * 3)
+
+ self.c_proj = JukeboxConv1D(hidden_dim, self.embed_dim)
+ self.attn_dropout = nn.Dropout(config.attn_dropout)
+ self.resid_dropout = nn.Dropout(config.resid_dropout)
+
+ # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
+ self.attn_func = attn_func
+ if attn_func == "cross_attention":
+ self.qkv = self.decode_qkv
+ elif attn_func == "prime_attn":
+ self.qkv = self.prime_qkv
+ else:
+ self.qkv = self.factored_qkv
+
+ ATTENTION_MAP = {
+ "dense_attn": (self.dense_attn, "autoregressive"),
+ "block_attn": (self.block_attn, "autoregressive"),
+ "transpose_block_attn": (self.transpose_block_attn, "autoregressive"),
+ "prev_block_attn": (self.prev_block_attn, None),
+ "summary_attn": (self.summary_attn, "summary"),
+ "summary_spread_attn": (self.summary_spread_attn, "summary"),
+ "cross_attention": (self.dense_attn, None),
+ "prime_attn": (self.prime_attn, "prime"),
+ }
+ self.attn, self.attn_mask = ATTENTION_MAP[attn_func]
+
+ self.blocks = config.blocks
+ self.spread = config.spread
+ if self.blocks is not None:
+ self.block_ctx = self.n_ctx // self.blocks
+
+ self.sample_t = 0
+ self.cache = {}
+ self.encoder_len = config.nb_relevant_lyric_tokens # length of the encoder input ids
+ self.record_attn = False
+
+ def _attn(self, query_states, key_states, value_states, sample):
+ scale = self.scale
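+ # `scale` is head_dim**-0.25 and is applied to both queries and keys, which is equivalent
+ # to the usual 1/sqrt(head_dim) scaling of the attention scores while keeping the
+ # intermediate activations smaller.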
+ if self.training:
+ attention_weight = torch.matmul(query_states * scale, key_states * scale)
+ else:
+ attention_weight = torch.matmul(query_states, key_states)
+ attention_weight.mul_(scale * scale)
+ attn_weight_type = attention_weight.dtype
+ attention_weight = attention_weight.float()
+ if self.mask:
+ # Generate appropriate mask to mask out all positions before current
+ # Might take up lot of memory for dense, so can cache it
+ mask = get_mask(
+ self.attn_mask,
+ query_states.size(-2),
+ key_states.size(-1),
+ self.blocks,
+ self.spread,
+ attention_weight.device,
+ sample,
+ self.sample_t,
+ )
+ if mask is not None:
+ attention_weight = attention_weight * mask + -1e9 * (1 - mask)
+ attention_prob = F.softmax(attention_weight, dim=-1).type(attn_weight_type)
+ if self.record_attn:
+ self.attention_prob = attention_prob
+ if self.attn_func == "prime_attn":
+ # only keep music queries and lyrics keys/values
+ self.attention_prob = self.attention_prob[:, :, self.encoder_len :, : self.encoder_len]
+ attention_prob = self.attn_dropout(attention_prob)
+ context_states = torch.matmul(attention_prob, value_states)
+ return context_states
+
+ def merge_heads(self, hidden_states):
+ hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+ new_hidden_states_shape = (*hidden_states.size()[:-2], hidden_states.size(-2) * hidden_states.size(-1))
+ return hidden_states.view(*new_hidden_states_shape) # in Tensorflow implem: fct merge_states
+
+ def split_heads(self, hidden_states, is_key=False):
+ new_hidden_states_shape = (
+ *hidden_states.size()[:-1],
+ self.n_heads,
+ hidden_states.size(-1) // self.n_heads,
+ )
+ hidden_states = hidden_states.view(*new_hidden_states_shape) # in Tensorflow implem: fct split_states
+ if is_key:
+ return hidden_states.permute(0, 2, 3, 1)
+ else:
+ return hidden_states.permute(0, 2, 1, 3)
+
+ def dense_attn(self, query, key, value, sample):
+ query = self.split_heads(query)
+ key = self.split_heads(key, is_key=True)
+ value = self.split_heads(value)
+ context_states = self._attn(query, key, value, sample)
+ context_states = self.merge_heads(context_states)
+ return context_states
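+
+ # Shape sketch for dense_attn (illustrative): (batch, seq, hidden) is split into
+ # (batch, heads, seq, head_dim) for queries/values and (batch, heads, head_dim, seq) for
+ # keys, so `query @ key` in `_attn` directly yields (batch, heads, seq_q, seq_k) scores.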
+
+ def block_attn(self, query, key, value, sample):
+ block_ctx = self.block_ctx
+ batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t
+ if sample:
+ return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+ else:
+ query_length = query.shape[1]
+ query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
+ if query_length < seq_len:
+ seq_len = query_length
+ key = key[:, -seq_len:].contiguous()
+ value = value[:, -seq_len:].contiguous()
+ key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+ value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+ return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+ def transpose_block_attn(self, query, key, value, sample):
+ block_ctx = self.block_ctx
+ batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t
+ if sample:
+ block_len = (seq_len - 1) % block_ctx
+ key = key[:, block_len::block_ctx, :]
+ value = value[:, block_len::block_ctx, :]
+ return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+ else:
+ query_length = query.shape[1]
+ query = query.view(batch_size, query_length // block_ctx, block_ctx, embed_dim)
+ query = query.transpose(1, 2).contiguous()
+ query = query.view(batch_size * block_ctx, query_length // block_ctx, embed_dim)
+
+ key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
+ key = key.transpose(1, 2).contiguous()
+ key = key.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
+
+ value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
+ value = value.transpose(1, 2).contiguous()
+ value = value.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
+
+ block_attn = self.dense_attn(query, key, value, sample)
+ block_attn = block_attn.view(batch_size, block_ctx, query_length // block_ctx, embed_dim)
+ block_attn = block_attn.transpose(1, 2).contiguous()
+ block_attn = block_attn.view(batch_size, query_length, embed_dim)
+
+ return block_attn
+
+ def prev_block_attn(self, query, key, value, sample):
+ block_ctx = self.block_ctx
+ batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t
+ if sample:
+ block = (seq_len - 1) // block_ctx
+ prev_l = (block - 1) * block_ctx
+ if block > 0:
+ key = key[:, prev_l : prev_l + block_ctx, :]
+ value = value[:, prev_l : prev_l + block_ctx, :]
+ else:
+ key = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype)
+ value = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype)
+ return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+ else:
+ query_length = query.shape[1]
+ query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
+
+ key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
+ key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0))
+ key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+
+ value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
+ value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0))
+ value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+
+ if query_length < seq_len:
+ nb_query_blocks = query_length // block_ctx
+ nb_key_blocks = seq_len // block_ctx
+ seq_len = query_length
+ key = key.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:]
+ key = key.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim)
+
+ value = value.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:]
+ value = value.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim)
+
+ return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+ def summary_attn(self, query, key, value, sample):
+ blocks = self.blocks
+ block_ctx = self.block_ctx
+ batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t
+ if sample:
+ key = key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :]
+ key = torch.nn.functional.pad(key, (0, 0, 1, 0))
+
+ value = value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :]
+ value = torch.nn.functional.pad(value, (0, 0, 1, 0))
+ return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+ else:
+ key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :]
+ key = torch.nn.functional.pad(key, (0, 0, 1, 0)) # batch_size, blocks, embed_dim
+
+ value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :]
+ value = torch.nn.functional.pad(value, (0, 0, 1, 0)) # batch_size, blocks, embed_dim
+ return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+ def summary_spread_attn(self, query, key, value, sample):
+ blocks = self.blocks
+ spread = self.spread
+
+ batch_size, seq_len, embed_dim = value.shape # For sample, query_len= 1, key_len = value_len = sample_t
+ if sample:
+ raise NotImplementedError("Sampling is not supported for summary_spread_attn")
+ else:
+ key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
+ key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0)).contiguous()
+ key = key.view(batch_size, blocks * spread, embed_dim)
+
+ value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
+ value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0)).contiguous()
+ value = value.view(batch_size, blocks * spread, embed_dim)
+
+ return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+ def prime_attn(self, query, key, value, sample):
+ encoder_len = self._encoder_len
+ key = key[:, :encoder_len]
+ value = value[:, :encoder_len]
+ return self.dense_attn(query, key, value, sample)
+
+ def factored_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+ curr_ctx = hidden_states.shape[1]
+ if last_encoder_hidden_states is not None:
+ raise TypeError("last_encoder_hidden_states should be None")
+
+ query, key, value = hidden_states.chunk(3, dim=2)
+ if sample:
+ self.sample_t += curr_ctx
+ key, value = self._append_cache(key, value)
+ l_cache = self._suff_cache_len()
+ if self._cache_len() > l_cache:
+ self._slice_cache(-l_cache)
+ if curr_ctx > 1:
+ if self.attn_func != "dense_attn":
+ query = self._pad_to_block_ctx(query, query=True)
+ key = self._pad_to_block_ctx(key)
+ value = self._pad_to_block_ctx(value)
+ sample = False
+ else:
+ key = self.cache["key"]
+ value = self.cache["value"]
+ return query, key, value, sample
+
+ def prime_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+ curr_ctx = hidden_states.shape[1]
+ if last_encoder_hidden_states is not None:
+ raise TypeError("last_encoder_hidden_states should be None")
+ query, key, value = hidden_states.chunk(3, dim=2)
+ if sample:
+ if self._cache_len() < self._encoder_len:
+ self._append_cache(key, value)
+ if self._cache_len() > self._encoder_len:
+ self._slice_cache(0, self._encoder_len)
+ key, value = self.cache["key"], self.cache["value"]
+ self.sample_t += curr_ctx
+ return query, key, value, sample
+
+ def decode_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+ curr_ctx = hidden_states.shape[1]
+ query = hidden_states
+ if sample:
+ if self.sample_t == 0:
+ self.cache["key"], self.cache["value"] = self.c_enc_kv(
+ last_encoder_hidden_states.type_as(hidden_states)
+ ).chunk(2, dim=2)
+ key, value = self.cache["key"], self.cache["value"]
+ self.sample_t += curr_ctx
+ else:
+ key, value = self.c_enc_kv(last_encoder_hidden_states.type_as(hidden_states)).chunk(2, dim=2)
+ return query, key, value, sample
+
+ def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+ curr_ctx = hidden_states.shape[1]
+ hidden_states = self.c_attn(hidden_states)
+ query, key, value, sample = self.qkv(
+ hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample
+ )
+ attention_scores = self.attn(query, key, value, sample)
+ if attention_scores.shape[1] != curr_ctx:
+ offset = self._offset(curr_ctx)
+ attention_scores = attention_scores[:, offset : offset + curr_ctx, :].contiguous()
+ attention_scores = self.c_proj(attention_scores)
+ return self.resid_dropout(attention_scores)
+
+ @property
+ def _encoder_len(self):
+ encoder_len = self.encoder_len
+ encoder_blocks = (encoder_len // self.blocks) + 1
+ return encoder_blocks * self.blocks
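+
+ # Worked example (illustrative values): with encoder_len=512 and blocks=16 this returns
+ # (512 // 16 + 1) * 16 = 528, i.e. the length is always rounded up to the next block
+ # boundary, even when it is already an exact multiple.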
+
+ def _offset(self, curr_ctx):
+ if self.attn_func == "dense_attn":
+ return 0
+ return (self.sample_t - curr_ctx) % self.block_ctx
+
+ def _pad_to_block_ctx(self, hidden_states, query=False):
+ seq_len = hidden_states.shape[1]
+ offset = self._offset(seq_len) if query else 0
+ n_blocks = (seq_len + offset + self.block_ctx - 1) // self.block_ctx
+ pad = n_blocks * self.block_ctx - seq_len - offset
+ if pad == 0 and offset == 0:
+ return hidden_states
+ else:
+ return F.pad(hidden_states, (0, 0, offset, pad))
+
+ def _cache_len(self):
+ return 0 if "key" not in self.cache else self.cache["key"].shape[1]
+
+ def _suff_cache_len(self):
+ """
+ Precondition:
+ key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
+ location in the context.
+ """
+ previous_block_length = (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx
+ REQUIRED_CACHE_LEN = {
+ "dense_attn": self.sample_t,
+ "block_attn": (self.sample_t - 1) % self.block_ctx + 1,
+ "transpose_block_attn": self.sample_t,
+ "prev_block_attn": self.sample_t if self.sample_t <= self.block_ctx else previous_block_length,
+ "cross_attn": self.encoder_len,
+ "prime_attn": min(self.sample_t, self._encoder_len),
+ }
+
+ return REQUIRED_CACHE_LEN[self.attn_func]
+
+ def _slice_cache(self, start, end=None):
+ self.cache["key"] = self.cache["key"][:, start:end]
+ self.cache["value"] = self.cache["value"][:, start:end]
+
+ def _append_cache(self, key, value):
+ if "key" not in self.cache:
+ self.cache["key"] = key
+ self.cache["value"] = value
+ else:
+ old_key, old_value = key, value
+ key = torch.cat([self.cache["key"], old_key], dim=1)
+ value = torch.cat([self.cache["value"], old_value], dim=1)
+ del self.cache["key"]
+ del self.cache["value"]
+ del old_key
+ del old_value
+ self.cache["key"] = key
+ self.cache["value"] = value
+ return self.cache["key"], self.cache["value"]
+
+ def del_cache(self):
+ self.sample_t = 0
+ if "key" in self.cache:
+ del self.cache["key"]
+ if "value" in self.cache:
+ del self.cache["value"]
+ self.cache = {}
+
+
+class JukeboxBlock(nn.Module):
+ def __init__(self, config, n_ctx, attn_func="dense_attn"):
+ super().__init__()
+ self.width = config.hidden_size
+ self.attn = JukeboxAttention(config, n_ctx, attn_func=attn_func)
+
+ self.layer_norm_0 = JukeboxLayerNorm(config.hidden_size)
+ self.mlp = JukeboxMLP(config)
+ self.layer_norm_1 = JukeboxLayerNorm(config.hidden_size)
+ self.res_scale = 1.0 / config.num_layers if config.attn_res_scale else 1.0
+ self.attn_func = attn_func
+
+ def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
+ residuals = hidden_states
+ hidden_states = self.layer_norm_0(hidden_states)
+ hidden_states = self.attn(hidden_states, last_encoder_hidden_states, sample)
+
+ output_states = self.layer_norm_1(residuals + hidden_states)
+ output_states = self.mlp(output_states)
+ if self.res_scale == 1.0:
+ output = residuals + hidden_states + output_states
+ else:
+ output = residuals + self.res_scale * (hidden_states + output_states)
+ return output
+
+
+class JukeboxLayerStack(nn.Module):
+ def __init__(self, config, n_ctx):
+ super().__init__()
+ self.n_ctx = n_ctx
+ self.width = config.hidden_size
+ self.num_layers = config.num_layers
+ self.blocks = config.blocks
+ self.attention_pattern = config.attention_pattern
+ if self.blocks is not None:
+ self.block_ctx = n_ctx // self.blocks
+ self.encoder_len = config.nb_relevant_lyric_tokens
+ self.n_heads = config.n_heads
+
+ # Orders of attn_func
+ attention_pattern = ATTENTION_PATTERNS[self.attention_pattern]
+ self._attn_mods = nn.ModuleList()
+ for depth in range(self.num_layers):
+ self._attn_mods.append(JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth)))
+
+ self.saved_attn_weights = []
+
+ def set_record_attn(self, record_attn):
+ """
+ Makes forward prop dump self-attention softmaxes to self.saved_attn_weights.
+
+ Args:
+ record_attn (`Union[bool, set]`):
+ Either a set of layer indices indicating which layers to store, or a boolean value indicating whether
+ to dump all.
+ """
+
+ def _should_record_attn(layer_idx):
+ if isinstance(record_attn, bool):
+ return record_attn
+ return layer_idx in record_attn
+
+ for i, layer in enumerate(self._attn_mods):
+ layer.attn.record_attn = _should_record_attn(i)
+
+ if not record_attn:
+ self.saved_attn_weights = []
+
+ def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+ # Blocks
+ for i, attn_layer in enumerate(self._attn_mods):
+ if attn_layer.attn_func == "cross_attention": # attend to the lyrics
+ hidden_states = attn_layer(
+ hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample
+ )
+ else:
+ hidden_states = attn_layer(hidden_states, last_encoder_hidden_states=None, sample=sample)
+ if attn_layer.attn.record_attn:
+ self.saved_attn_weights.append(attn_layer.attn.attention_prob)  # the recorded softmax, see `_attn`
+ return hidden_states
+
+ def del_cache(self):
+ for attn_layer in self._attn_mods:
+ attn_layer.attn.del_cache()
+
+
+class JukeboxPositionalEmbedding(nn.Module):
+ def __init__(self, embed_dim, width):
+ super().__init__()
+ self.pos_emb = nn.Parameter(torch.empty((embed_dim, width)))
+
+ def forward(self):
+ pos_emb = self.pos_emb
+ return pos_emb
+
+
+class JukeboxConditionalAutoregressive(nn.Module):
+ def __init__(
+ self,
+ config,
+ n_ctx=None,
+ embed_dim=None,
+ audio_conditioning=False,
+ metadata_conditioning=False,
+ is_encoder=False,
+ ):
+ """
+ Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
+ set for each configuration.
+
+ Args:
+ config (`JukeboxPriorConfig`):
+ Model configuration class with all the parameters of the model. Initializing with a config file does
+ not load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ n_ctx (`int`, *optional*):
+ Number of tokens or lyrics tokens provided in a single pass.
+ embed_dim (`int`, *optional*):
+ Either the dimension of the codebook, or the sum of n_vocab (lyrics) and the codebook dimension if the
+ model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder.
+ audio_conditioning (`bool`, *optional*, defaults to `False`):
+ Whether or not the prior supports conditioning on audio.
+ metadata_conditioning (`bool`, *optional*, defaults to `False`):
+ Whether or not the prior supports conditioning on artist, genres, lyrics and timing.
+ is_encoder (`bool`, *optional*, defaults to `False`):
+ Whether the model is an encoder only model.
+ """
+
+ super().__init__()
+ self.width = config.hidden_size
+ self.num_layers = config.num_layers
+ self.n_ctx = n_ctx if n_ctx is not None else config.n_ctx
+ self.embed_dim = embed_dim if embed_dim is not None else config.music_vocab_size
+ self.embed_tokens = nn.Embedding(self.embed_dim, config.hidden_size)
+ self.embed_tokens_dropout = nn.Dropout(config.emb_dropout)
+ self.metadata_conditioning = metadata_conditioning
+ self.audio_conditioning = audio_conditioning
+ if not metadata_conditioning:
+ self.start_token = nn.Parameter(torch.empty((1, config.hidden_size)))
+ self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.hidden_size)
+ self.pos_emb_dropout = nn.Dropout(config.emb_dropout)
+
+ self.transformer = JukeboxLayerStack(config, n_ctx=self.n_ctx)
+ self.is_encoder = is_encoder
+ self.encoder_len = config.nb_relevant_lyric_tokens
+
+ if config.merged_decoder:
+ # Merged piped model uses this setup
+ self.add_cond_after_transformer = False
+ self.share_embed_tokens_fc_proj_out = False
+ else:
+ self.add_cond_after_transformer = True
+ self.share_embed_tokens_fc_proj_out = True
+
+ if not is_encoder:
+ self.fc_proj_out = nn.Linear(config.hidden_size, self.embed_dim, bias=False)
+ if self.share_embed_tokens_fc_proj_out:
+ self.fc_proj_out.weight = self.embed_tokens.weight
+ self.loss = torch.nn.CrossEntropyLoss()
+
+ def forward(
+ self,
+ tokens,
+ audio_conditioning=None,
+ metadata_conditioning=None,
+ last_encoder_hidden_states=None,
+ get_preds=False,
+ get_acts=False,
+ get_sep_loss=False,
+ ):
+ """
+ Args:
+ tokens (`torch.tensor`):
+ Can represent music tokens, lyrics tokens or both, depending on the configuration.
+ """
+ # Preprocess.
+ batch_size = tokens.shape[0]
+ with torch.no_grad():
+ tokens = tokens.view(batch_size, -1).long()
+
+ if not self.audio_conditioning:
+ audio_conditioning = torch.zeros(
+ (batch_size, 1, self.width),
+ device=tokens.device,
+ dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype,
+ )
+
+ target = tokens # Target
+ hidden_states = self.embed_tokens(tokens)
+ # Shift by 1, and fill in start token
+ hidden_states = torch.cat((hidden_states[:, -1:], hidden_states[:, :-1]), dim=1)
+ if self.metadata_conditioning:
+ hidden_states[:, 0] = metadata_conditioning.view(batch_size, self.width)
+ else:
+ hidden_states[:, 0] = self.start_token
+
+ hidden_states = (
+ self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning
+ ) # Pos emb and dropout
+
+ hidden_states = self.transformer(
+ hidden_states, last_encoder_hidden_states=last_encoder_hidden_states
+ ) # Transformer
+ if self.add_cond_after_transformer: # Piped doesn't add x_cond
+ hidden_states = hidden_states + audio_conditioning
+
+ activations = hidden_states
+ if self.is_encoder:
+ return hidden_states
+
+ hidden_states = self.fc_proj_out(hidden_states) # Predictions
+ loss_fn = nn.CrossEntropyLoss()
+ if get_sep_loss:
+ lyric_hidden_states = hidden_states[:, : self.encoder_len].reshape(-1, self.embed_dim)
+ token_hidden_states = hidden_states[:, self.encoder_len :].reshape(-1, self.embed_dim)
+
+ lyric_loss = loss_fn(lyric_hidden_states, target[:, : self.encoder_len].reshape(-1)) / np.log(2.0)
+ music_token_loss = loss_fn(token_hidden_states, target[:, self.encoder_len :].reshape(-1)) / np.log(2.0)
+
+ loss = (lyric_loss, music_token_loss) # Note order! Lyric is first
+ else:
+ loss = loss_fn(hidden_states.view(-1, self.embed_dim), target.view(-1)) / np.log(2.0) # Loss
+
+ if get_preds:
+ return loss, hidden_states
+ elif get_acts:
+ return loss, activations
+ else:
+ return loss, None
+
+ def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning):
+ if sample_t == 0:
+ hidden_states = torch.empty(n_samples, 1, self.width, dtype=self.embed_tokens.weight.dtype).to(
+ self.embed_tokens.weight.device
+ )
+ if self.metadata_conditioning:
+ hidden_states[:, 0] = metadata_conditioning.view(n_samples, self.width)
+ else:
+ hidden_states[:, 0] = self.start_token
+ else:
+ hidden_states = self.embed_tokens(tokens)
+ if audio_conditioning.shape == (n_samples, self.n_ctx, self.width):
+ cond = audio_conditioning[:, sample_t : sample_t + 1, :]
+ else:
+ cond = audio_conditioning
+ # Pos emb, dropout is identity at eval time
+ hidden_states = hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond
+ return hidden_states, cond
+
+ def sample(
+ self,
+ n_samples,
+ audio_conditioning=None,
+ metadata_conditioning=None,
+ last_encoder_hidden_states=None,
+ temp=1.0,
+ top_k=0,
+ top_p=0.0,
+ get_preds=False,
+ sample_tokens=None,
+ ):
+ if sample_tokens is None:
+ sample_tokens = self.n_ctx
+
+ if not self.audio_conditioning:
+ audio_conditioning = torch.zeros(
+ (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
+ ).to(self.fc_proj_out.device)
+
+ with torch.no_grad():
+ sampled_tokens = []
+ tokens = None
+ if get_preds:
+ preds = []
+
+ iterator = tqdm(range(0, sample_tokens), leave=False)
+ for sample_t in iterator:
+ iterator.set_description(f"Ancestral sampling {sample_tokens} music tokens", refresh=True)
+ hidden_states, cond = self.get_emb(
+ sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
+ )
+
+ hidden_states = self.transformer(
+ hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
+ )
+ if self.add_cond_after_transformer:
+ hidden_states = hidden_states + cond
+ hidden_states = self.fc_proj_out(hidden_states) # Predictions
+ if get_preds:
+ preds.append(hidden_states.clone())
+ # Adjust logits
+ hidden_states = hidden_states / temp
+ hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
+ # Sample and replace hidden_states
+ tokens = torch.distributions.Categorical(logits=hidden_states).sample()
+ sampled_tokens.append(tokens.clone())
+
+ del tokens
+ self.transformer.del_cache()
+
+ tokens = torch.cat(sampled_tokens, dim=1)
+ if get_preds:
+ preds = torch.cat(preds, dim=1)
+ if get_preds:
+ return tokens, preds
+ else:
+ return tokens
+
+ def split_chunks(self, length, chunk_size):
+ n_passes = (length + chunk_size - 1) // chunk_size
+ chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
+ return chunk_sizes
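+
+ # Worked example (illustrative values): split_chunks(length=10, chunk_size=4) computes
+ # n_passes = 3 and returns [4, 4, 2], i.e. full chunks followed by the remainder.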
+
+ def primed_sample(
+ self,
+ n_samples,
+ lyric_and_music_tokens,
+ audio_conditioning=None,
+ metadata_conditioning=None,
+ last_encoder_hidden_states=None,
+ temp=1.0,
+ top_k=0,
+ top_p=0.0,
+ get_preds=False,
+ chunk_size=None,
+ sample_tokens=None,
+ ):
+ if sample_tokens is None:
+ sample_tokens = self.n_ctx
+ # Preprocess.
+ batch_size = lyric_and_music_tokens.shape[0]
+ with torch.no_grad():
+ lyric_and_music_tokens = lyric_and_music_tokens.view(batch_size, -1).long()
+
+ sampled_audio = torch.split(lyric_and_music_tokens, 1, dim=1)
+ sampled_audio = list(sampled_audio)
+
+ if not self.audio_conditioning:
+ audio_conditioning = torch.zeros(
+ (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
+ ).to(lyric_and_music_tokens.device)
+
+ with torch.no_grad():
+ if get_preds:
+ preds = []
+
+ # Fill up key/value cache for past context by running forward pass.
+ # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
+ if chunk_size is None:
+ chunk_size = len(sampled_audio)
+ chunk_sizes = self.split_chunks(len(sampled_audio), chunk_size)
+ x_primes = []
+ start = 0
+ token = None
+
+ for current_chunk_size in tqdm(chunk_sizes, desc="Preparing past key value", leave=False):
+ sampled_audio_prime, conds_prime = [], []
+ for sample_t in range(start, start + current_chunk_size):
+ x_prime, cond_prime = self.get_emb(
+ sample_t, n_samples, token, audio_conditioning, metadata_conditioning
+ )
+ token = sampled_audio[sample_t]
+ sampled_audio_prime.append(x_prime)
+ conds_prime.append(cond_prime)
+ start = start + current_chunk_size
+ x_prime, cond_prime = torch.cat(sampled_audio_prime, dim=1), torch.cat(conds_prime, dim=1)
+ del sampled_audio_prime
+ del conds_prime
+ if not get_preds:
+ del cond_prime
+ x_prime = self.transformer(x_prime, last_encoder_hidden_states=last_encoder_hidden_states, sample=True)
+
+ if get_preds:
+ if self.add_cond_after_transformer:
+ x_prime = x_prime + cond_prime
+ del cond_prime
+ x_primes.append(x_prime)
+ else:
+ del x_prime
+
+ if get_preds:
+ x_prime = torch.cat(x_primes, dim=1)
+ x_prime = self.fc_proj_out(x_prime) # Predictions
+ preds.append(x_prime)
+
+ # the input of the encoder and decoder can be merged into (lyrics, music tokens)
+ input_tokens = sampled_audio[-1]
+
+ iterator = tqdm(
+ range(len(sampled_audio), sample_tokens),
+ desc=f"Sampling {len(range(len(sampled_audio), sample_tokens))} music tokens",
+ leave=False,
+ )
+ for sample_t in iterator:
+ hidden_states, cond = self.get_emb(
+ sample_t, n_samples, input_tokens, audio_conditioning, metadata_conditioning
+ )
+
+ hidden_states = self.transformer(
+ hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
+ )
+ if self.add_cond_after_transformer:
+ hidden_states = hidden_states + cond
+ hidden_states = self.fc_proj_out(hidden_states) # Predictions
+ if get_preds:
+ preds.append(hidden_states)
+ # Adjust logits
+ hidden_states = hidden_states / temp
+ hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
+ # only music tokens are sampled
+ music_tokens = torch.distributions.Categorical(logits=hidden_states).sample()
+ sampled_audio.append(music_tokens.clone())
+ input_tokens = music_tokens
+
+ del input_tokens, music_tokens
+ self.transformer.del_cache()
+
+ music_tokens = torch.cat(sampled_audio, dim=1)
+ if get_preds:
+ preds = torch.cat(preds, dim=1)
+ if get_preds:
+ return music_tokens, preds
+ else:
+ return music_tokens
+
+
+class JukeboxMusicTokenConditioner(nn.Module):
+    """
+    The `JukeboxMusicTokenConditioner` takes music tokens as input (corresponding to codes from the VQVAE's
+    codebook) and upsamples them using a single layer of a decoder convolution block (the same block used in the
+    VQVAE).
+    """
+
+ def __init__(self, config, level):
+ super().__init__()
+ self.embed_tokens = nn.Embedding(config.music_vocab_size, config.hidden_size)
+ config.embed_dim = config.music_vocab_size # setting correct argument for the `JukeboxDecoder`
+
+ self.upsampler = JukeboxDecoderConvBock(
+ config,
+ config.hidden_size,
+ config.res_conv_width,
+ config.res_conv_depth,
+ config.res_downs_t[level],
+ config.res_strides_t[level],
+ reverse_dilation=False,
+ )
+ self.layer_norm = JukeboxLayerNorm(config.hidden_size)
+
+ def forward(self, music_tokens, raw_audio_conditioning=None):
+ """
+ Args:
+ music_tokens (`torch.LongTensor`):
+                Music tokens from the upper level, in `range(nb_discrete_codes)`
+ raw_audio_conditioning (`torch.LongTensor`, *optional*):
+ Audio used when primed sampling, raw audio information that conditions the generation
+ """
+ if raw_audio_conditioning is None:
+ raw_audio_conditioning = 0.0
+ # Embed music_tokens
+ music_tokens = music_tokens.long()
+ hidden_states = self.embed_tokens(music_tokens)
+ hidden_states = hidden_states + raw_audio_conditioning
+
+ # Run conditioner
+ hidden_states = hidden_states.permute(0, 2, 1)
+ hidden_states = self.upsampler(hidden_states)
+ hidden_states = hidden_states.permute(0, 2, 1)
+ hidden_states = self.layer_norm(hidden_states)
+ return hidden_states
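The permute/upsample/permute pattern above exists because the convolutional upsampler works on `(batch, channels, time)` while the transformer works on `(batch, time, hidden)`. A minimal sketch of that data flow, using a plain `nn.ConvTranspose1d` as a stand-in for the actual `JukeboxDecoderConvBock` and hypothetical sizes:

```python
import torch
from torch import nn

# Hypothetical sizes, for illustration only
batch, seq_len, hidden, vocab = 2, 8, 16, 32
tokens = torch.randint(0, vocab, (batch, seq_len))

embed = nn.Embedding(vocab, hidden)
# Toy stand-in for the upsampler: doubles the time axis
conv = nn.ConvTranspose1d(hidden, hidden, kernel_size=4, stride=2, padding=1)

hidden_states = embed(tokens)                   # (2, 8, 16), i.e. (batch, time, hidden)
hidden_states = hidden_states.permute(0, 2, 1)  # (2, 16, 8) for the 1D conv
hidden_states = conv(hidden_states)             # (2, 16, 16) -- time axis doubled
hidden_states = hidden_states.permute(0, 2, 1)  # (2, 16, 16) back to (batch, time, hidden)
print(hidden_states.shape)  # torch.Size([2, 16, 16])
```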
+
+
+class JukeboxRangeEmbedding(nn.Module):
+    """
+    The `JukeboxRangeEmbedding` interpolates the given range [pos_start, pos_end] to obtain an equivalent of a time
+    positional embedding of length `n_ctx`.
+
+    Binning process: each position `pos` is mapped to one of `bins` buckets via
+    [start, end) -> [0, 1) -> [0, bins) -> floor -> {0, ..., bins - 1}.
+    Note that the interval is open on the right, so `start <= pos < end`, not `pos <= end`.
+    """
+
+ def __init__(self, n_time, embed_dim, range, out_width, clamp=False):
+ super().__init__()
+ self.n_time = n_time
+ self.embed_dim = embed_dim
+ self.emb = nn.Embedding(embed_dim, out_width)
+ self.pos_min, self.pos_max = range
+ self.clamp = clamp
+
+ def forward(self, pos_start, pos_end=None):
+ # Check if [pos_start,pos_end] in [pos_min, pos_max)
+        if len(pos_start.shape) != 2:
+            raise TypeError(f"Expected shape with 2 dims, got {pos_start.shape}")
+        # parentheses required: `not` must apply to the conjunction of both range checks
+        if not ((self.pos_min <= pos_start).all() and (pos_start < self.pos_max).all()):
+            raise TypeError(f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}")
+
+ pos_start = pos_start.float()
+ if pos_end is not None:
+ if self.clamp:
+ pos_end = pos_end.clamp(self.pos_min, self.pos_max)
+
+ pos_end = pos_end.float()
+ # Interpolate so that [pos_start, ..., pos_end] <-> position tensor of length n_ctx
+ n_time = self.n_time
+ if n_time != 1:
+ interpolation = (
+ torch.arange(0, n_time, dtype=torch.float, device=pos_start.device).view(1, n_time) / n_time
+ )
+ position = pos_start + (pos_end - pos_start) * interpolation
+ else:
+ position = pos_start
+
+        # Bin each value to bins_
+        # [start, end) -> [0, 1) -> [0, embed_dim) -> floor -> {0, ..., embed_dim - 1}
+ normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)
+ bins_ = (self.embed_dim * normalised_position).floor().long().detach()
+ return self.emb(bins_)
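A small numeric walkthrough of the binning above, with hypothetical toy values:

```python
import torch

# Toy numbers: positions in [0, 100) binned into embed_dim = 10 buckets
pos_min, pos_max, embed_dim = 0.0, 100.0, 10
position = torch.tensor([[0.0, 25.0, 99.9]])

normalised = (position - pos_min) / (pos_max - pos_min)  # -> values in [0, 1)
bins_ = (embed_dim * normalised).floor().long()          # -> indices in {0, ..., 9}
print(bins_)  # tensor([[0, 2, 9]])
```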
+
+
+class JukeboxLabelConditioner(nn.Module):
+ def __init__(self, config, include_time_signal):
+ super().__init__()
+
+ embed_dim = config.hidden_size
+ timing_dims = config.timing_dims
+ sampling_rate = config.sampling_rate
+ nb_genres, nb_artists = config.metadata_dims
+ music_tokens_shape = config.n_ctx
+
+ self.max_nb_genres = config.max_nb_genres
+ self.bow_genre_emb = nn.Embedding(nb_genres, embed_dim)
+ self.artist_emb = nn.Embedding(nb_artists, embed_dim)
+ self.include_time_signal = include_time_signal
+ if self.include_time_signal:
+ total_length_range = (config.min_duration * sampling_rate, config.max_duration * sampling_rate)
+ absolute_pos_range = (0.0, config.max_duration * sampling_rate)
+ relative_pos_range = (0.0, 1.0)
+ self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, embed_dim)
+ self.absolute_pos_emb = JukeboxRangeEmbedding(
+ music_tokens_shape, timing_dims, absolute_pos_range, embed_dim
+ )
+ self.relative_pos_emb = JukeboxRangeEmbedding(
+ music_tokens_shape, timing_dims, relative_pos_range, embed_dim, clamp=True
+ )
+
+ def forward(self, metadata):
+ total_length = metadata[:, 0:1]
+ offset = metadata[:, 1:2]
+ length = metadata[:, 2:3]
+ artist = metadata[:, 3:4]
+ genre = metadata[:, 4:]
+
+ # Start embedding of length 1
+ artist_emb = self.artist_emb(artist)
+ # Empty genre slots are denoted by -1. We mask these out.
+ mask = (genre >= 0).float().unsqueeze(2)
+ genre_emb = (self.bow_genre_emb(genre.clamp(0)) * mask).sum(dim=1, keepdim=True)
+ start_emb = genre_emb + artist_emb
+
+ # Pos embedding of length n_ctx
+ if self.include_time_signal:
+ start, end = offset, offset + length
+ total_length = total_length.float()
+ start = start.float()
+ end = end.float()
+ pos_emb = (
+ self.total_length_emb(total_length)
+ + self.absolute_pos_emb(start, end)
+ + self.relative_pos_emb(start / total_length, end / total_length)
+ )
+ else:
+ pos_emb = None
+ return start_emb, pos_emb
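To make the slicing concrete, here is the assumed layout of one metadata row (all values hypothetical), following the indexing in `forward` above:

```python
import torch

# [total_length, offset, length, artist_id, genre_0, ..., genre_{n-1}],
# with unused genre slots padded with -1
metadata = torch.tensor([[882000, 0, 441000, 10, 145, 7, -1, -1, -1]])

total_length = metadata[:, 0:1]  # tensor([[882000]])
offset = metadata[:, 1:2]        # tensor([[0]])
length = metadata[:, 2:3]        # tensor([[441000]])
artist = metadata[:, 3:4]        # tensor([[10]])
genre = metadata[:, 4:]          # tensor([[145,   7,  -1,  -1,  -1]])

# Mask used to zero out the padded genre slots before the bag-of-words sum
mask = (genre >= 0).float().unsqueeze(2)
print(mask.squeeze(2))  # tensor([[1., 1., 0., 0., 0.]])
```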
+
+
+class JukeboxPrior(PreTrainedModel):
+    """
+    The JukeboxPrior class, which is a wrapper around the various conditioning modules and the transformer. A
+    JukeboxPrior can be seen as a language model trained on music: it models the next `music token` prediction task.
+    If a (lyric) `encoder` is defined, it also models the `next character` prediction on the lyrics. Can be
+    conditioned on timing, artist, genre, lyrics and codes from lower-level Priors.
+
+ Args:
+ config (`JukeboxPriorConfig`):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+ level (`int`, *optional*):
+ Current level of the Prior. Should be in range `[0,nb_priors]`.
+ nb_priors (`int`, *optional*, defaults to 3):
+ Total number of priors.
+ vqvae_encoder (`Callable`, *optional*):
+ Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of
+ the vqvae module to avoid getting the parameters.
+ vqvae_decoder (`Callable`, *optional*):
+ Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of
+ the vqvae module to avoid getting the parameters.
+ """
+
+ config: JukeboxPriorConfig
+
+ def _init_weights(self, module):
+ init_scale = self.config.init_scale
+
+ if isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+ elif isinstance(module, JukeboxConv1D):
+ if self.config.zero_out:
+ module.weight.data.zero_()
+ else:
+ module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+ elif isinstance(module, JukeboxPositionalEmbedding):
+ module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
+ elif isinstance(module, JukeboxRangeEmbedding):
+ module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
+ elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "lm_head"):
+ module.lm_head.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+ elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "start_token"):
+ module.start_token.data.normal_(mean=0.0, std=0.01 * init_scale)
+ elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
+ module.conv1d_2.weight.data.zero_()
+ module.conv1d_2.bias.data.zero_()
+ if isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+
+ def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_encoder=None, vqvae_decoder=None):
+ super().__init__(config)
+ # Passing functions instead of the vqvae module to avoid getting params, only used in the
+ # forward loop
+ self.vqvae_encoder = vqvae_encoder
+ self.vqvae_decoder = vqvae_decoder
+
+ self.levels = nb_priors
+ self.level = level if level is not None else config.level
+
+ self.base_model_prefix = f"priors.{self.level}"
+
+ self.n_ctx = config.n_ctx
+
+ self.lyric_conditioning = config.nb_relevant_lyric_tokens > 0
+ self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens
+ self.encoder_loss_fraction = config.encoder_loss_fraction
+
+ # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
+ self.audio_conditioning = self.level != 0
+ self.cond_level = self.level - 1
+ if self.audio_conditioning:
+ self.conditioner_blocks = JukeboxMusicTokenConditioner(config, self.level)
+
+        # metadata conditioning: conditioning on timing, genres, and artist
+ self.metadata_conditioning = config.metadata_conditioning
+ if self.metadata_conditioning:
+ self.metadata_embedding = JukeboxLabelConditioner(config, include_time_signal=not self.audio_conditioning)
+
+ # define encoder-decoder or encoder and decoder
+ self.is_encoder_decoder = config.is_encoder_decoder
+ if config.is_encoder_decoder:
+ # encoder-decoder transformer
+ self.input_shapes = [config.nb_relevant_lyric_tokens, config.n_ctx]
+ self.embed_dim_shift = [0, config.lyric_vocab_size]
+ self.width = config.hidden_size
+
+ self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens
+
+ self.prior = JukeboxConditionalAutoregressive(
+ config,
+ n_ctx=config.nb_relevant_lyric_tokens + config.n_ctx,
+ embed_dim=config.lyric_vocab_size + config.music_vocab_size,
+ audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
+ metadata_conditioning=True,
+ )
+
+ else:
+ # Separate encoder-decoder transformer
+ encoder_config = config.encoder_config
+
+ if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
+ self.lyric_acts_width = encoder_config.hidden_size
+ self.encoder_width = config.hidden_size
+ self.encoder_dim = config.lyric_vocab_size
+ self.encoder = JukeboxConditionalAutoregressive(
+ encoder_config,
+ n_ctx=self.nb_relevant_lyric_tokens,
+ embed_dim=self.encoder_dim,
+ audio_conditioning=False,
+ metadata_conditioning=False,
+ is_encoder=True,
+ )
+ self.encoder.proj_in = JukeboxConv1D(encoder_config.hidden_size, config.hidden_size)
+ self.encoder.final_layer_norm = JukeboxLayerNorm(config.hidden_size)
+ self.encoder.lm_head = nn.Linear(config.hidden_size, config.lyric_vocab_size, bias=False)
+ else:
+ self.nb_relevant_lyric_tokens = 0
+
+ # decoder model on the tokens
+ self.prior = JukeboxConditionalAutoregressive(
+ config,
+ audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
+ metadata_conditioning=self.metadata_conditioning,
+ )
+
+ self.next_token_prediction_loss_dims = config.n_ctx
+ self.total_loss_dims = self.nb_relevant_lyric_tokens + self.next_token_prediction_loss_dims
+
+ self.downsamples = [stride**down for stride, down in zip(config.res_strides_t, config.res_downs_t)]
+ self.cond_downsample = self.downsamples[self.level] if self.level != 0 else None
+ self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - self.level])
+ self.sample_length = self.n_ctx * self.raw_to_tokens
+
+ logger.info(
+ f"Level:{self.level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample"
+ f" length:{self.sample_length}"
+ )
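The `raw_to_tokens` arithmetic can be checked in isolation. A sketch with illustrative stride/depth values (not necessarily the released configuration):

```python
import numpy as np

# Hypothetical per-level strides and depths, for illustration only
res_strides_t = [2, 2, 2]
res_downs_t = [3, 2, 2]
nb_priors = 3

downsamples = [stride**down for stride, down in zip(res_strides_t, res_downs_t)]  # [8, 4, 4]
for level in range(nb_priors):
    raw_to_tokens = np.prod(downsamples[: nb_priors - level])
    print(level, raw_to_tokens)
# level 0 (top):    8 * 4 * 4 = 128 raw audio samples per music token
# level 1 (middle): 8 * 4     = 32
# level 2 (bottom): 8         = 8
```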
+
+ def get_metadata(self, labels, start, total_length, offset, get_indices=False):
+ metadata = labels.clone()
+ metadata[:, 0] = total_length
+ # Set sample_length to match this level
+ metadata[:, 2] = int(self.sample_length)
+
+ # Set offset
+ metadata[:, 1:2] = int(offset * self.raw_to_tokens) + int(start * self.raw_to_tokens)
+        # since metadata contains the full token list, we only need to select the relevant ones
+
+ # Set lyric tokens
+ metadata, indices = self.set_metadata_lyric_tokens(metadata)
+ if get_indices:
+ return metadata, indices
+ else:
+ return metadata
+
+ def set_metadata_lyric_tokens(self, labels):
+ """
+ Processes the full labels to only retrieve the relevant lyric tokens and keep the metadata conditioning tokens.
+ """
+ if self.nb_relevant_lyric_tokens > 0:
+ tokens_list = torch.zeros(
+ (labels.shape[0], self.nb_relevant_lyric_tokens), dtype=torch.long, device=labels.device
+ )
+ indices_list = [] # what's the index of each current character in original array
+ for idx in range(labels.shape[0]):
+ full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_nb_genres :]
+ total_length, offset, duration = labels[idx, 0], labels[idx, 1], labels[idx, 2]
+ tokens, indices = get_relevant_lyric_tokens(
+ full_tokens, self.nb_relevant_lyric_tokens, total_length, offset, duration
+ )
+ tokens_list[idx, :] = tokens
+ indices_list.append(indices)
+
+ return (
+ torch.cat((labels[:, : 4 + self.metadata_embedding.max_nb_genres], tokens_list), dim=-1),
+ indices_list,
+ )
+ else:
+ return labels, None
+
+ def get_music_tokens_conds(self, music_tokens, start, end):
+ """
+ Extracts current level's conditioning music tokens.
+ """
+ if self.level != 0:
+ music_tokens_cond = music_tokens[self.level - 1]
+ music_tokens = music_tokens_cond[:, start // self.cond_downsample : end // self.cond_downsample]
+ missing_cond_len = self.n_ctx // self.cond_downsample - music_tokens_cond[-1].shape[-1]
+ if missing_cond_len > 0:
+ init_cond = torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)
+ music_tokens_cond = torch.cat((music_tokens_cond, init_cond), dim=-1).long()
+ music_tokens_conds = [music_tokens_cond]
+ else:
+ music_tokens_conds = None
+ return music_tokens_conds
+
+ def prior_preprocess(self, tokens, conds):
+        """
+        Shifts the input tokens to account for the dictionary merge. `embed_dim_shift` gives the amount by which the
+        music tokens should be shifted. It is equal to `lyric_vocab_size`.
+        """
+ batch_size = tokens[0].shape[0]
+ for i in range(len(tokens)):
+ tokens[i] = (tokens[i] + int(self.embed_dim_shift[i])).view(batch_size, -1)
+
+ for i in range(len(conds)):
+ if conds[i] is None:
+ conds[i] = torch.zeros(
+ (batch_size, self.input_shapes[i], self.width), dtype=tokens[0].dtype, device=tokens[0].device
+ )
+
+ return torch.cat(tokens, dim=1), torch.cat(conds, dim=1)
+
+ def prior_postprocess(self, tokens):
+ """
+        Shifts back the input tokens if the model uses an encoder-decoder architecture. As the embedding layer is
+ shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music
+ tokens.
+ """
+ batch_size = tokens.shape[0]
+ dims = (self.input_shapes[0], tokens.shape[1] - self.input_shapes[0])
+ tokens = list(torch.split(tokens, dims, dim=1))
+
+        # Some of the input tokens might be shifted to take into account the vocabulary fusion
+ for i in range(len(tokens)):
+ bins_shift = int(self.embed_dim_shift[i])
+ tokens[i] = (tokens[i] - bins_shift).view(batch_size, -1)
+ tokens[i] = torch.clamp(tokens[i], min=0)
+ # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
+ return tokens[-1]
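A toy round-trip through the shift logic of `prior_preprocess` / `prior_postprocess`, assuming a hypothetical `lyric_vocab_size` of 80:

```python
import torch

# In the merged dictionary, ids [0, 80) are lyric tokens and ids [80, 80 + music_vocab_size)
# are music tokens, so music ids are shifted up on the way in and back down on the way out.
lyric_vocab_size = 80  # hypothetical value
embed_dim_shift = [0, lyric_vocab_size]

lyric_tokens = torch.tensor([[5, 12, 3]])
music_tokens = torch.tensor([[100, 7, 42]])

# preprocess: shift each stream and concatenate
merged = torch.cat(
    [lyric_tokens + embed_dim_shift[0], music_tokens + embed_dim_shift[1]], dim=1
)
print(merged)  # tensor([[  5,  12,   3, 180,  87, 122]])

# postprocess: split off the music part and undo the shift
recovered = merged[:, lyric_tokens.shape[1]:] - embed_dim_shift[1]
print(recovered)  # tensor([[100,   7,  42]])
```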
+
+ def embed_tokens(self, music_tokens_conds):
+        """
+        Embeds the upper-level music tokens and upsamples them to use as audio conditioning.
+        """
+ music_tokens_conds = music_tokens_conds[: self.cond_level + 1]
+ audio_conditioning = None
+ for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, [self.conditioner_blocks]))):
+ audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
+ return audio_conditioning
+
+ def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
+ """
+ Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states.
+ """
+ if start_level is None:
+ start_level = self.level
+ if end_level is None:
+ end_level = self.levels
+ # Get latents
+ with torch.no_grad():
+ latent_states = self.vqvae_encoder(
+ hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks
+ )
+ return latent_states
+
+ def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
+        """
+        Upsamples the sequence of codebook vectors to raw audio.
+        """
+ if start_level is None:
+ start_level = self.level
+ if end_level is None:
+ end_level = self.levels
+ with torch.no_grad():
+ output = self.vqvae_decoder(
+ music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks
+ )
+ return output
+
+ def get_cond(self, music_tokens_conds, metadata):
+        """
+        Converts the input tokens to input embeddings. Splits the lyrics from the rest of the metadata. Lyric tokens
+        can be None.
+        """
+ if metadata is not None:
+ n_labels = metadata.shape[1] - self.nb_relevant_lyric_tokens
+ metadata, lyric_tokens = metadata[:, :n_labels], metadata[:, n_labels:]
+ else:
+ metadata, lyric_tokens = None, None
+ metadata_conditioning, metadata_pos = (
+ self.metadata_embedding(metadata) if self.metadata_conditioning else (None, None)
+ )
+ audio_conditioning = self.embed_tokens(music_tokens_conds) if self.audio_conditioning else metadata_pos
+ return audio_conditioning, metadata_conditioning, lyric_tokens
+
+ def sample(
+ self,
+ n_samples,
+ music_tokens=None,
+ music_tokens_conds=None,
+ metadata=None,
+ temp=1.0,
+ top_k=0,
+ top_p=0.0,
+ chunk_size=None,
+ sample_tokens=None,
+ ):
+ """
+        Ancestral or primed sampling of a window of tokens using the provided conditioning and metadata.
+
+ Args:
+ n_samples (`int`):
+ Number of samples to generate.
+ music_tokens (`list[torch.LongTensor]`, *optional*):
+ Previously generated tokens at the current level. Used as context for the generation.
+ music_tokens_conds (`list[torch.FloatTensor]`, *optional*):
+ Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
+ conditioned on the upper-level tokens.
+ metadata (`list[torch.LongTensor]`, *optional*):
+ List containing the metadata tensor with the artist, genre and the lyric tokens.
+ temp (`float`, *optional*, defaults to 1.0):
+ Sampling temperature.
+ top_k (`int`, *optional*, defaults to 0):
+ Top k probabilities used for filtering.
+ top_p (`float`, *optional*, defaults to 0.0):
+ Top p probabilities used for filtering.
+ chunk_size (`int`, *optional*):
+ Size of the chunks used to prepare the cache of the transformer.
+ sample_tokens (`int`, *optional*):
+ Number of tokens to sample.
+
+ """
+ no_past_context = music_tokens is None or music_tokens.shape[1] == 0
+ name = {True: "Ancestral", False: "Primed"}[no_past_context]
+ logger.info(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
+
+ with torch.no_grad():
+ # Currently audio_conditioning only uses immediately above layer
+ audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
+ if self.is_encoder_decoder:
+                if no_past_context:  # the primed_sample function will be used with music_tokens set to None
+ lyric_and_music_tokens, audio_conditioning = self.prior_preprocess(
+ [lyric_tokens], [None, audio_conditioning]
+ )
+ else:
+ lyric_and_music_tokens, audio_conditioning = self.prior_preprocess(
+ [lyric_tokens, music_tokens], [None, audio_conditioning]
+ )
+ if sample_tokens is not None:
+ sample_tokens += self.nb_relevant_lyric_tokens
+ music_tokens = self.prior.primed_sample(
+ n_samples,
+ lyric_and_music_tokens,
+ audio_conditioning,
+ metadata_conditioning,
+ temp=temp,
+ top_k=top_k,
+ top_p=top_p,
+ chunk_size=chunk_size,
+ sample_tokens=sample_tokens,
+ )
+ music_tokens = self.prior_postprocess(music_tokens)
+ else:
+ last_encoder_hidden_states = self.get_encoder_states(lyric_tokens, sample=True)
+ if no_past_context:
+ music_tokens = self.prior.sample(
+ n_samples,
+ audio_conditioning,
+ metadata_conditioning,
+ last_encoder_hidden_states,
+ temp=temp,
+ top_k=top_k,
+ top_p=top_p,
+ sample_tokens=sample_tokens,
+ )
+ else:
+ music_tokens = self.prior.primed_sample(
+ n_samples,
+ music_tokens,
+ audio_conditioning,
+ metadata_conditioning,
+ last_encoder_hidden_states,
+ temp=temp,
+ top_k=top_k,
+ top_p=top_p,
+ chunk_size=chunk_size,
+ sample_tokens=sample_tokens,
+ )
+ return music_tokens
+
+ def get_encoder_states(self, lyric_tokens, sample=False):
+ """
+ Retrieve the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
+ the lyric encoder.
+ """
+ if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
+ if sample:
+ self.encoder = self.encoder.to(lyric_tokens.device)
+ lyric_acts = self.encoder(lyric_tokens, None, None, None)
+ lyric_acts = self.encoder.proj_in(lyric_acts)
+ last_encoder_hidden_states = self.encoder.final_layer_norm(lyric_acts)
+ else:
+ last_encoder_hidden_states = None
+ return last_encoder_hidden_states
+
+ def get_encoder_loss(self, last_encoder_hidden_states, target_lyrics):
+ """
+ Computes the loss for the lyric encoder: next lyric token prediction.
+ """
+ if self.lyric_conditioning:
+ last_encoder_hidden_states = self.encoder.lm_head(last_encoder_hidden_states)
+ encoder_loss = nn.functional.cross_entropy(
+ last_encoder_hidden_states.view(-1, self.encoder_dim), target_lyrics.view(-1)
+ ) / np.log(2.0)
+        else:
+            # `last_encoder_hidden_states` is None in this branch, so take the device from elsewhere
+            device = target_lyrics.device if target_lyrics is not None else self.device
+            encoder_loss = torch.tensor(0.0, device=device)
+ return encoder_loss
+
+ def forward_tokens(
+ self, music_tokens, music_tokens_conds=[], metadata=None, get_preds=False, get_attn_weights=False
+ ):
+ """
+ Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
+ vqvae's encoding layers.
+ """
+ if get_attn_weights:
+ self.prior.transformer.set_record_attn(get_attn_weights)
+ audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
+
+ if self.is_encoder_decoder: # the preprocess returns the full tokens (Lyrics and Music tokens), shifted
+ tokens, audio_conditioning = self.prior_preprocess(
+ [lyric_tokens, music_tokens], [None, audio_conditioning]
+ )
+ (encoder_loss, next_token_prediction_loss), preds = self.prior(
+ tokens, audio_conditioning, metadata_conditioning, get_sep_loss=True, get_preds=get_preds
+ )
+ else:
+ last_encoder_hidden_states = self.get_encoder_states(lyric_tokens)
+ encoder_loss = self.get_encoder_loss(last_encoder_hidden_states, lyric_tokens)
+ next_token_prediction_loss, preds = self.prior(
+ music_tokens,
+ audio_conditioning,
+ metadata_conditioning,
+ last_encoder_hidden_states,
+ get_preds=get_preds,
+ )
+ loss = self.encoder_loss_fraction * encoder_loss * self.nb_relevant_lyric_tokens / self.total_loss_dims
+ loss += next_token_prediction_loss * self.next_token_prediction_loss_dims / self.total_loss_dims
+
+ metrics = {
+ "bpd": next_token_prediction_loss.detach().clone(),
+ "encoder_loss": encoder_loss.detach().clone(),
+ "next_token_prediction_loss": next_token_prediction_loss.detach().clone(),
+ }
+ if get_preds:
+ metrics["preds"] = preds.detach().clone()
+ if get_attn_weights:
+ saved_attn_weights = self.prior.transformer.saved_attn_weights
+ self.prior.transformer.set_record_attn(False)
+ return saved_attn_weights
+ else:
+ return loss, metrics
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ metadata: Optional[list[torch.LongTensor]],
+ decode: Optional[bool] = False,
+ get_preds: Optional[bool] = False,
+ ) -> list[torch.Tensor]:
+ """
+ Encode the hidden states using the `vqvae` encoder, and then predicts the next token in the `forward_tokens`
+ function. The loss is the sum of the `encoder` loss and the `decoder` loss.
+
+ Args:
+ hidden_states (`torch.Tensor`):
+ Hidden states which should be raw audio
+ metadata (`list[torch.LongTensor]`, *optional*):
+ List containing the metadata conditioning tensor with the lyric and the metadata tokens.
+ decode (`bool`, *optional*, defaults to `False`):
+                Whether or not to decode the encoded tokens back to raw audio.
+ get_preds (`bool`, *optional*, defaults to `False`):
+ Whether or not to return the actual predictions of the model.
+ """
+ batch_size = hidden_states.shape[0]
+ music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
+ loss, metrics = self.forward_tokens(
+ music_tokens=music_tokens,
+ music_tokens_conds=music_tokens_conds,
+ metadata=metadata,
+ get_preds=get_preds,
+ )
+ if decode:
+ dequantised_states = self.decode([music_tokens, *music_tokens_conds])
+ else:
+ dequantised_states = None
+ return dequantised_states, loss, metrics
+
+
+class JukeboxPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: JukeboxConfig
+ base_model_prefix = "jukebox"
+ supports_gradient_checkpointing = False
+
+ def _init_weights(self, module):
+ if isinstance(module, (JukeboxPrior, JukeboxVQVAE)):
+ module.apply(module._init_weights)
+
+ def __init__(self, *inputs, **kwargs):
+ super().__init__(*inputs, **kwargs)
+
+
+JUKEBOX_SAMPLING_INPUT_DOCSTRING = r"""
+    labels (`list[torch.LongTensor]` of length `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_length)`):
+ List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
+ condition the generation.
+ sampling_kwargs (`dict[Any]`):
+        Various additional sampling arguments that are used by the `_sample` function. A detailed list of the
+        arguments can be seen in the [`_sample`] function documentation.
+"""
+
+
+@add_start_docstrings(
+    """The bare JUKEBOX Model used for music generation. Four sampling techniques are supported: `primed_sample`,
+    `upsample`, `continue_sample` and `ancestral_sample`. It does not have a `forward` method as the training is not
+    end-to-end. If you want to fine-tune the model, it is recommended to use the `JukeboxPrior` class and train each
+    prior individually.
+ """,
+ JUKEBOX_START_DOCSTRING,
+)
+class JukeboxModel(JukeboxPreTrainedModel):
+ _no_split_modules = ["JukeboxBlock"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ vqvae_config = config.vqvae_config
+ self.vqvae = JukeboxVQVAE(vqvae_config)
+ self.set_shared_params(config)
+ self.priors = nn.ModuleList(
+ [JukeboxPrior(config.prior_configs[level], level) for level in range(config.nb_priors)]
+ )
+
+ def set_shared_params(self, model_config):
+        """
+        Initialises the parameters that are shared between priors. This has to be done here because the list of
+        `JukeboxPriorConfig` is nested, and is thus unreachable in the `from_dict` function.
+        """
+ for config in model_config.prior_configs:
+ config.sampling_rate = model_config.sampling_rate
+ config.timing_dims = model_config.timing_dims
+ config.min_duration = model_config.min_duration
+ config.max_duration = model_config.max_duration
+ config.max_nb_genres = model_config.max_nb_genres
+ config.metadata_conditioning = model_config.metadata_conditioning
+
+ def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
+ return self.vqvae.decode(music_tokens, start_level, end_level, bs_chunks)
+
+ def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
+ return self.vqvae.encode(input_audio, start_level, end_level, bs_chunks)
+
+ def split_batch(self, obj, n_samples, split_size):
+ n_passes = (n_samples + split_size - 1) // split_size
+ if isinstance(obj, torch.Tensor):
+ return torch.split(obj, split_size, dim=0)
+ elif isinstance(obj, list):
+ return list(zip(*[torch.split(item, split_size, dim=0) for item in obj]))
+ elif obj is None:
+ return [None] * n_passes
+ else:
+ raise TypeError("Unknown input type")
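A quick illustration of how `split_batch` partitions work, with toy values:

```python
import torch

# 5 samples with split_size 2 -> 3 passes
n_samples, split_size = 5, 2
obj = torch.arange(n_samples)

chunks = torch.split(obj, split_size, dim=0)
print([c.tolist() for c in chunks])  # [[0, 1], [2, 3], [4]]

# None conditioning is broadcast as one None per pass
n_passes = (n_samples + split_size - 1) // split_size
print([None] * n_passes)  # [None, None, None]
```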
+
+    def sample_level(
+        self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length, max_batch_size
+    ):
+        # Sample full windows of length n_ctx when the requested length fits at least one window,
+        # otherwise fall back to sampling a partial window
+        if total_length >= self.priors[level].n_ctx:
+ iterator = get_starts(total_length, self.priors[level].n_ctx, hop_length)
+ for start in iterator:
+ music_tokens = self.sample_single_window(
+ music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size
+ )
+
+ else:
+ music_tokens = self.sample_partial_window(
+ music_tokens, labels, offset, sampling_kwargs, level, total_length, max_batch_size
+ )
+ return music_tokens
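`get_starts` is defined elsewhere in this file; the sketch below reproduces the windowing arithmetic it is assumed to implement (overlapping windows of `n_ctx` tokens spaced `hop_length` apart, with the last window clamped so it still fits):

```python
# A minimal sketch, assuming the helper yields clamped window start offsets
def get_starts_sketch(total_length, n_ctx, hop_length):
    starts = []
    for start in range(0, total_length - n_ctx + hop_length, hop_length):
        start = min(start, total_length - n_ctx)  # keep the final window in range
        starts.append(start)
    return starts

print(get_starts_sketch(total_length=24, n_ctx=8, hop_length=4))
# [0, 4, 8, 12, 16] -> five overlapping windows of 8 tokens covering 24 tokens
```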
+
+ @torch.no_grad()
+ def _sample(
+ self,
+ music_tokens,
+ labels,
+ sample_levels,
+ metas=None,
+ chunk_size=32,
+ sampling_temperature=0.98,
+ lower_batch_size=16,
+ max_batch_size=16,
+ sample_length_in_seconds=24,
+ compute_alignments=False,
+ sample_tokens=None,
+ offset=0,
+ save_results=True,
+ sample_length=None,
+ ) -> list[torch.LongTensor]:
+ """
+ Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving
+ the generated raw audio at each step.
+
+ Args:
+ music_tokens (`list[torch.LongTensor]`):
+ A sequence of music tokens of length `self.levels` which will be used as context to continue the
+ sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain
+ level.
+ labels (`list[torch.LongTensor]`):
+ List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
+ lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
+ which are used to condition the generation.
+ sample_levels (`list[int]`):
+ List of the desired levels at which the sampling will be done. A level is equivalent to the index of
+ the prior in the list of priors
+ metas (`list[Any]`, *optional*):
+                Metadata used to generate the `labels`.
+ chunk_size (`int`, *optional*, defaults to 32):
+ Size of a chunk of audio, used to fill up the memory in chunks to prevent OOM errors. Bigger chunks
+ means faster memory filling but more consumption.
+ sampling_temperature (`float`, *optional*, defaults to 0.98):
+ Temperature used to adjust the randomness of the sampling.
+ lower_batch_size (`int`, *optional*, defaults to 16):
+ Maximum batch size for the lower level priors
+ max_batch_size (`int`, *optional*, defaults to 16):
+ Maximum batch size for the top level priors
+ sample_length_in_seconds (`int`, *optional*, defaults to 24):
+ Desired length of the generation in seconds
+ compute_alignments (`bool`, *optional*, defaults to `False`):
+ Whether or not to compute the alignment between the lyrics and the audio using the top_prior
+ sample_tokens (`int`, *optional*):
+ Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy
+ experiments
+ offset (`int`, *optional*, defaults to 0):
+                Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is
+                greater than 0, the lyrics will be shifted to take that into account.
+ save_results (`bool`, *optional*, defaults to `True`):
+ Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
+ time.
+ sample_length (`int`, *optional*):
+ Desired length of the generation in samples.
+
+        Returns: `list[torch.LongTensor]`
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, JukeboxModel, set_seed
+ >>> import torch
+
+ >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+ >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
+
+ >>> labels = tokenizer(**metas)["input_ids"]
+ >>> set_seed(0)
+ >>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
+ >>> zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
+ >>> zs[0]
+ tensor([[1853, 1369, 1150, 1869, 1379, 1789, 519, 710, 1306, 1100, 1229, 519,
+ 353, 1306, 1379, 1053, 519, 653, 1631, 1467, 1229, 1229, 10, 1647,
+ 1254, 1229, 1306, 1528, 1789, 216, 1631, 1434, 653, 475, 1150, 1528,
+ 1804, 541, 1804, 1434]])
+ ```
+ """
+
+ top_prior = self.priors[0]
+ if sample_length is not None:
+ total_length = sample_length
+ else:
+ total_length = (
+ int(sample_length_in_seconds * self.config.sampling_rate) // top_prior.raw_to_tokens
+ ) * top_prior.raw_to_tokens
+
+ if sample_levels is None:
+ sample_levels = range(len(self.priors))
+
+        # total length of the signal, which might differ slightly from the actual generated length
+ self.total_length = total_length
+ for level in sample_levels:
+ sampling_kwargs = {
+ "temp": 0.99 if level == len(self.priors) - 1 else sampling_temperature,
+ "chunk_size": chunk_size,
+ "sample_tokens": sample_tokens,
+ }
+ # Set correct total_length, hop_length, labels and sampling_kwargs for level
+
+ total_token_to_sample = total_length // self.priors[level].raw_to_tokens
+ hop_length = int(self.config.hop_fraction[level] * self.priors[level].n_ctx)
+            # lower-level priors use the smaller batch size; the top-level prior (priors[0]) keeps max_batch_size
+            max_batch_size = lower_batch_size if level != 0 else max_batch_size
+ music_tokens = self.sample_level(
+ music_tokens,
+ labels[level],
+ offset,
+ sampling_kwargs,
+ level,
+ total_token_to_sample,
+ hop_length,
+ max_batch_size,
+ )
+
+ if save_results:
+ self.vqvae.to(music_tokens[level].device)
+ # Decode sample
+ with torch.no_grad():
+ start_level = len(self.priors) - level - 1 # vqvae levels are reversed
+ raw_audio = self.vqvae.decode(
+ music_tokens[: level + 1], start_level=start_level, bs_chunks=music_tokens[level].shape[0]
+ )
+ logdir = f"jukebox/level_{level}"
+ if not os.path.exists(logdir):
+ os.makedirs(logdir)
+ save_temp_audio(logdir, level, metas=metas, aud=raw_audio.float())
+ if compute_alignments and self.priors[0] is not None and self.priors[0].nb_relevant_lyric_tokens > 0:
+ with torch.no_grad():
+ alignments = get_alignment(music_tokens, labels[0], self.priors[0], self.config)
+ torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
+
+ return music_tokens
+
+ @add_start_docstrings(
+ """
+        Generates music tokens based on the provided `labels`. Will start at the desired prior level and automatically
+ upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
+ the VQ-VAE decoder to convert the music tokens to raw audio.
+
+ Args:
+            labels (`list[torch.LongTensor]`):
+                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
+                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
+                which are used to condition the generation.
+            n_samples (`int`, *optional*, defaults to 1):
+                Number of samples to be generated in parallel.
+ """,
+ )
+ def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> list[torch.LongTensor]:
+ """
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, JukeboxModel, set_seed
+
+ >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
+ >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+
+ >>> lyrics = "Hey, are you awake? Can you talk to me?"
+ >>> artist = "Zac Brown Band"
+ >>> genre = "Country"
+ >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
+ >>> set_seed(0)
+ >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)
+
+ >>> with torch.no_grad():
+ ... model.decode(music_tokens)[:, :10].squeeze(-1)
+ tensor([[-0.0219, -0.0679, -0.1050, -0.1203, -0.1271, -0.0936, -0.0396, -0.0405,
+ -0.0818, -0.0697]])
+ ```
+ """
+
+ sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
+ music_tokens = [
+ torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))
+ ]
+ music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+ return music_tokens
+
+ @add_start_docstrings(
+ """Generates a continuation of the previously generated tokens.
+
+ Args:
+            music_tokens (`list[torch.LongTensor]` of length `self.levels`):
+ A sequence of music tokens which will be used as context to continue the sampling process. Should have
+ `self.levels` tensors, each corresponding to the generation at a certain level.
+ """,
+ JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+ )
+ def continue_sample(self, music_tokens, labels, **sampling_kwargs) -> list[torch.LongTensor]:
+ sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
+ music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+ return music_tokens
+
+ @add_start_docstrings(
+ """Upsamples a sequence of music tokens using the prior at level `level`.
+
+ Args:
+            music_tokens (`list[torch.LongTensor]` of length `self.levels`):
+ A sequence of music tokens which will be used as context to continue the sampling process. Should have
+ `self.levels` tensors, each corresponding to the generation at a certain level.
+ """,
+ JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+ )
+ def upsample(self, music_tokens, labels, **sampling_kwargs) -> list[torch.LongTensor]:
+ sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors) - 1)))
+ music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+ return music_tokens
+
+ @add_start_docstrings(
+        """Generates raw audio conditioned on the provided `raw_audio`, which is used as conditioning at each of the
+        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
+        used as conditioning for each level, which means that no ancestral sampling is required.
+
+ Args:
+            raw_audio (`list[torch.Tensor]` of length `n_samples`):
+                A list of raw audio that will be used as conditioning information for each sample that will be
+                generated.
+ """,
+ JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+ )
+ def primed_sample(self, raw_audio, labels, **sampling_kwargs) -> list[torch.LongTensor]:
+ sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
+ self.vqvae.to(raw_audio.device).float()
+ with torch.no_grad():
+ music_tokens = self.vqvae.encode(
+ raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.shape[0]
+ )
+ music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+ return music_tokens
+
+
+__all__ = ["JukeboxModel", "JukeboxPreTrainedModel", "JukeboxVQVAE", "JukeboxPrior"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec2162db2cce52876fee37d9806c89b4dbabca2f
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py
@@ -0,0 +1,407 @@
+# coding=utf-8
+# Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI Jukebox."""
+
+import json
+import os
+import re
+import unicodedata
+from json.encoder import INFINITY
+from typing import Any, Optional, Union
+
+import numpy as np
+import regex
+
+from ....tokenization_utils import AddedToken, PreTrainedTokenizer
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
+from ....utils.generic import _is_jax, _is_numpy
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "artists_file": "artists.json",
+ "lyrics_file": "lyrics.json",
+ "genres_file": "genres.json",
+}
+
+
+class JukeboxTokenizer(PreTrainedTokenizer):
+ """
+ Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
+ - Artists, unique ids are associated to each artist from the provided dictionary.
+ - Genres, unique ids are associated to each genre from the provided dictionary.
+ - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
+ vocabulary.
+
+    This tokenizer does not require training. It should be able to process a different number of inputs,
+    as the conditioning of the model can be done on the three different queries. If None is provided, default values
+    will be used.
+
+    Depending on the number of genres on which the model should be conditioned (`n_genres`):
+ ```python
+ >>> from transformers import JukeboxTokenizer
+
+ >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+ >>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"]
+ [tensor([[ 0, 0, 0, 6785, 546, 41, 38, 30, 76, 46, 41, 49,
+ 40, 76, 44, 41, 27, 30]]), tensor([[ 0, 0, 0, 145, 0]]), tensor([[ 0, 0, 0, 145, 0]])]
+ ```
+
+    If nothing is provided, the genres and the artist will either be selected randomly or set to None. However, the
+    code does not allow that and only supports composing from various genres.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`], which contains most of the main methods. Users should refer
+    to this superclass for more information regarding those methods.
+
+ Args:
+ artists_file (`str`):
+ Path to the vocabulary file which contains a mapping between artists and ids. The default file supports
+ both "v2" and "v3"
+ genres_file (`str`):
+            Path to the vocabulary file which contains a mapping between genres and ids.
+ lyrics_file (`str`):
+ Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
+        version (`list[str]`, *optional*, defaults to `["v3", "v2", "v2"]`):
+            List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
+            `v2`.
+        n_genres (`int`, *optional*, defaults to 5):
+            Maximum number of genres to use for composition.
+        max_n_lyric_tokens (`int`, *optional*, defaults to 512):
+            Maximum number of lyric tokens to keep.
+ unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ artists_file,
+ genres_file,
+ lyrics_file,
+ version=["v3", "v2", "v2"],
+ max_n_lyric_tokens=512,
+ n_genres=5,
+ unk_token="<|endoftext|>",
+ **kwargs,
+ ):
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ self.version = version
+ self.max_n_lyric_tokens = max_n_lyric_tokens
+ self.n_genres = n_genres
+ self._added_tokens_decoder = {0: unk_token}
+
+ with open(artists_file, encoding="utf-8") as vocab_handle:
+ self.artists_encoder = json.load(vocab_handle)
+
+ with open(genres_file, encoding="utf-8") as vocab_handle:
+ self.genres_encoder = json.load(vocab_handle)
+
+ with open(lyrics_file, encoding="utf-8") as vocab_handle:
+ self.lyrics_encoder = json.load(vocab_handle)
+
+ oov = r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
+        # In v2 the vocabulary had n_vocab=80 characters; in v3 the "+" character was missed, so n_vocab=79.
+ if len(self.lyrics_encoder) == 79:
+ oov = oov.replace(r"\-'", r"\-+'")
+
+ self.out_of_vocab = regex.compile(oov)
+ self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
+ self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
+ self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
+ super().__init__(
+ unk_token=unk_token,
+ n_genres=n_genres,
+ version=version,
+ max_n_lyric_tokens=max_n_lyric_tokens,
+ **kwargs,
+ )
+
+ @property
+ def vocab_size(self):
+ return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder)
+
+ def get_vocab(self):
+ return {
+ "artists_encoder": self.artists_encoder,
+ "genres_encoder": self.genres_encoder,
+ "lyrics_encoder": self.lyrics_encoder,
+ }
+
+ def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
+ """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
+ The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
+ the lyrics token sequence.
+ """
+ artists_id = [self.artists_encoder.get(artist, 0) for artist in list_artists]
+ for genres in range(len(list_genres)):
+ list_genres[genres] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres]]
+ list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
+
+ lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[0]], [], []]
+ return artists_id, list_genres, lyric_ids
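The genre padding above can be illustrated in isolation (hypothetical encoder mapping, `n_genres=5` as in the tokenizer defaults):

```python
# Unused genre slots are filled with -1 so that the label conditioner
# can mask them out later.
genres_encoder = {"country": 145, "rock": 7}  # hypothetical mapping
n_genres = 5

genres = ["country", "rock"]
ids = [genres_encoder.get(g, 0) for g in genres]
ids = ids + [-1] * (n_genres - len(ids))
print(ids)  # [145, 7, -1, -1, -1]
```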
+
+ def _tokenize(self, lyrics):
+ """
+ Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
+ vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+ Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
+ """
+ # only lyrics are not tokenized, but character based is easily handled
+ return list(lyrics)
+
+ def tokenize(self, artist, genre, lyrics, **kwargs):
+        """
+        Converts the three strings (artist, genre, lyrics) into sequences of tokens using the tokenizer.
+        """
+ artist, genre, lyrics = self.prepare_for_tokenization(artist, genre, lyrics)
+ lyrics = self._tokenize(lyrics)
+ return artist, genre, lyrics
+
+ def prepare_for_tokenization(
+ self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False
+ ) -> tuple[str, str, str, dict[str, Any]]:
+ """
+ Performs any necessary transformations before tokenization.
+
+        Args:
+            artists (`str`):
+                The artist name to prepare. This will mostly lowercase the string.
+            genres (`str`):
+                The genre name to prepare. This will mostly lowercase the string.
+            lyrics (`str`):
+                The lyrics to prepare.
+ is_split_into_words (`bool`, *optional*, defaults to `False`):
+ Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+ tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+ which it will tokenize. This is useful for NER or token classification.
+ """
+ for idx in range(len(self.version)):
+ if self.version[idx] == "v3":
+ artists[idx] = artists[idx].lower()
+ genres[idx] = [genres[idx].lower()]
+ else:
+ artists[idx] = self._normalize(artists[idx]) + ".v2"
+ genres[idx] = [
+ self._normalize(genre) + ".v2" for genre in genres[idx].split("_")
+ ] # split is for the full dictionary with combined genres
+
+ if self.version[0] == "v2":
+ self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
+ vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
+ self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
+ self.vocab[""] = 0
+ self.n_vocab = len(vocab) + 1
+ self.lyrics_encoder = self.vocab
+ self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
+ self.lyrics_decoder[0] = ""
+ else:
+ self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
+
+ lyrics = self._run_strip_accents(lyrics)
+ lyrics = lyrics.replace("\\", "\n")
+ lyrics = self.out_of_vocab.sub("", lyrics), [], []
+ return artists, genres, lyrics
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _normalize(self, text: str) -> str:
+ """
+ Normalizes the input text. This process is for the genres and the artist
+
+ Args:
+ text (`str`):
+ Artist or Genre string to normalize
+ """
+
+ accepted = (
+ [chr(i) for i in range(ord("a"), ord("z") + 1)]
+ + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
+ + [chr(i) for i in range(ord("0"), ord("9") + 1)]
+ + ["."]
+ )
+ accepted = frozenset(accepted)
+ pattern = re.compile(r"_+")
+ text = "".join([c if c in accepted else "_" for c in text.lower()])
+ text = pattern.sub("_", text).strip("_")
+ return text
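A self-contained sketch of `_normalize` showing its effect on a couple of example strings:

```python
import re

def normalize_sketch(text: str) -> str:
    # Same logic as `_normalize` above: keep [a-z0-9.], map everything else
    # to "_", then collapse runs of "_" and strip them from the ends.
    accepted = frozenset("abcdefghijklmnopqrstuvwxyz0123456789.")
    text = "".join(c if c in accepted else "_" for c in text.lower())
    return re.sub(r"_+", "_", text).strip("_")

print(normalize_sketch("Zac Brown Band"))  # zac_brown_band
print(normalize_sketch("AC/DC"))           # ac_dc
```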
+
+ def convert_lyric_tokens_to_string(self, lyrics: list[str]) -> str:
+ return " ".join(lyrics)
+
+ def convert_to_tensors(
+ self, inputs, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
+ ):
+ """
+ Convert the inner content to tensors.
+
+ Args:
+ tensor_type (`str` or [`~utils.TensorType`], *optional*):
+ The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
+ unset, no modification is done.
+            prepend_batch_axis (`bool`, *optional*, defaults to `False`):
+ Whether or not to add the batch dimension during the conversion.
+ """
+ # Convert to TensorType
+ if not isinstance(tensor_type, TensorType):
+ tensor_type = TensorType(tensor_type)
+
+ # Get a function reference for the correct framework
+ if tensor_type == TensorType.TENSORFLOW:
+ if not is_tf_available():
+ raise ImportError(
+ "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
+ )
+ import tensorflow as tf
+
+ as_tensor = tf.constant
+ is_tensor = tf.is_tensor
+ elif tensor_type == TensorType.PYTORCH:
+ if not is_torch_available():
+ raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
+ import torch
+
+ as_tensor = torch.tensor
+ is_tensor = torch.is_tensor
+ elif tensor_type == TensorType.JAX:
+ if not is_flax_available():
+ raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
+ import jax.numpy as jnp # noqa: F811
+
+ as_tensor = jnp.array
+ is_tensor = _is_jax
+ else:
+ as_tensor = np.asarray
+ is_tensor = _is_numpy
+
+ # Do the tensor conversion in batch
+
+ try:
+ if prepend_batch_axis:
+ inputs = [inputs]
+
+ if not is_tensor(inputs):
+ inputs = as_tensor(inputs)
+        except Exception as exc:
+            raise ValueError(
+                "Unable to create tensor, you should probably activate truncation and/or padding "
+                "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
+            ) from exc
+
+ return inputs
+
+ def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncoding:
+ """Convert the raw string to a list of token ids
+
+ Args:
+ artist (`str`):
+ Name of the artist.
+ genres (`str`):
+ List of genres that will be mixed to condition the audio
+ lyrics (`str`, *optional*, defaults to `""`):
+ Lyrics used to condition the generation
+ """
+ input_ids = [0, 0, 0]
+ artist = [artist] * len(self.version)
+ genres = [genres] * len(self.version)
+
+ artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
+ artists_id, genres_ids, full_tokens = self._convert_token_to_id(artists_tokens, genres_tokens, lyrics_tokens)
+
+ attention_masks = [-INFINITY] * len(full_tokens[-1])
+ input_ids = [
+ self.convert_to_tensors(
+ [input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]], tensor_type=return_tensors
+ )
+ for i in range(len(self.version))
+ ]
+ return BatchEncoding({"input_ids": input_ids, "attention_masks": attention_masks})
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ """
+ Saves the tokenizer's vocabulary dictionary to the provided save_directory.
+
+ Args:
+            save_directory (`str`):
+                A path to the directory where the vocabulary will be saved. It will be created if it doesn't exist.
+
+            filename_prefix (`Optional[str]`, *optional*):
+                A prefix to add to the names of the files saved by the tokenizer.
+
+ """
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+
+ artists_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["artists_file"]
+ )
+ with open(artists_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.artists_encoder, ensure_ascii=False))
+
+ genres_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["genres_file"]
+ )
+ with open(genres_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.genres_encoder, ensure_ascii=False))
+
+ lyrics_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["lyrics_file"]
+ )
+ with open(lyrics_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False))
+
+ return (artists_file, genres_file, lyrics_file)
+
+ def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
+        """
+        Converts an index (integer) into a token (str) using the vocab.
+
+ Args:
+ artists_index (`int`):
+ Index of the artist in its corresponding dictionary.
+ genres_index (`Union[list[int], int]`):
+ Index of the genre in its corresponding dictionary.
+ lyric_index (`list[int]`):
+ List of character indices, which each correspond to a character.
+ """
+ artist = self.artists_decoder.get(artists_index)
+ genres = [self.genres_decoder.get(genre) for genre in genres_index]
+ lyrics = [self.lyrics_decoder.get(character) for character in lyric_index]
+ return artist, genres, lyrics
+
+
+__all__ = ["JukeboxTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..53ec5ed37c13614266d04cde838ff7360946a451
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_mctct import *
+ from .feature_extraction_mctct import *
+ from .modeling_mctct import *
+ from .processing_mctct import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac8db3542c99e1af5aa9beb2487df2be6c50ef0c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86e7778a21f64decc54503137a69affb32cbe036
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/configuration_mctct.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bce04c74d0a569bb7a137428aaace321cf80bca
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/feature_extraction_mctct.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c5f719903f53c83f67182b6ae9b6464502c4952
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/modeling_mctct.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e498627cb236e43ab636294a3b88488d115ed98
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/__pycache__/processing_mctct.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..984dca4a62bfa726bae9081183cd18853381d3ad
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py
@@ -0,0 +1,184 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""M-CTC-T model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MCTCTConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`MCTCTModel`]. It is used to instantiate an
+ M-CTC-T model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the M-CTC-T
+ [speechbrain/m-ctc-t-large](https://huggingface.co/speechbrain/m-ctc-t-large) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 8065):
+ Vocabulary size of the M-CTC-T model. Defines the number of different tokens that can be represented by the
+ `input_ids` passed when calling [`MCTCTModel`].
+ hidden_size (`int`, *optional*, defaults to 1536):
+ Dimension of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 36):
+ Number of hidden layers in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 6144):
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 4):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ attention_head_dim (`int`, *optional*, defaults to 384):
+ Dimensions of each attention head for each attention layer in the Transformer encoder.
+ max_position_embeddings (`int`, *optional*, defaults to 920):
+ The maximum sequence length that this model might ever be used with (after log-mel spectrogram extraction).
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the layer normalization layers.
+ layerdrop (`float`, *optional*, defaults to 0.3):
+ The probability of dropping an encoder layer during training. The default 0.3 value is used in the original
+ implementation.
+ hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.3):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.3):
+ The dropout ratio for the attention probabilities.
+ pad_token_id (`int`, *optional*, defaults to 1):
+ The tokenizer index of the pad token.
+ bos_token_id (`int`, *optional*, defaults to 0):
+ The tokenizer index of the bos token.
+ eos_token_id (`int`, *optional*, defaults to 2):
+ The tokenizer index of the eos token.
+ conv_glu_dim (`int`, *optional*, defaults to 1):
+ The dimension of the output of the `Conv1dSubsampler` layer on which GLU is applied. Though the original
+ Flashlight code uses the value of 2, here it's adapted to 1 due to transposition differences.
+ conv_dropout (`float`, *optional*, defaults to 0.3):
+ The dropout probability applied inside the `Conv1dSubsampler` layer during training.
+ num_conv_layers (`int`, *optional*, defaults to 1):
+ Number of convolution layers before applying transformer encoder layers.
+ conv_kernel (`Sequence[int]`, *optional*, defaults to `(7,)`):
+ The kernel size of the 1D convolution applied before transformer layers. `len(conv_kernel)` must be equal
+ to `num_conv_layers`.
+ conv_stride (`Sequence[int]`, *optional*, defaults to `(3,)`):
+ The stride length of the 1D convolution applied before transformer layers. `len(conv_stride)` must be equal
+ to `num_conv_layers`.
+ input_feat_per_channel (`int`, *optional*, defaults to 80):
+ Feature dimensions of the channels of the input to the Conv1D layer.
+ input_channels (`int`, *optional*, defaults to 1):
+ Number of input channels of the input to the Conv1D layer.
+ conv_channels (`list[int]`, *optional*):
+ Channel sizes of intermediate Conv1D layers.
+ ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+ Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+ instance of [`MCTCTForCTC`].
+ ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+ Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+ occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+ of [`MCTCTForCTC`].
+
+ Example:
+
+ ```python
+ >>> from transformers import MCTCTConfig, MCTCTModel
+
+ >>> # Initializing a M-CTC-T mctct-large style configuration
+ >>> configuration = MCTCTConfig()
+
+ >>> # Initializing a model (with random weights) from the mctct-large style configuration
+ >>> model = MCTCTModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "mctct"
+
+ def __init__(
+ self,
+ vocab_size=8065,
+ hidden_size=1536,
+ num_hidden_layers=36,
+ intermediate_size=6144,
+ num_attention_heads=4,
+ attention_head_dim=384,
+ max_position_embeddings=920,
+ layer_norm_eps=1e-5,
+ layerdrop=0.3,
+ hidden_act="relu",
+ initializer_range=0.02,
+ hidden_dropout_prob=0.3,
+ attention_probs_dropout_prob=0.3,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ conv_glu_dim=1,
+ conv_dropout=0.3,
+ num_conv_layers=1,
+ conv_kernel=(7,),
+ conv_stride=(3,),
+ input_feat_per_channel=80,
+ input_channels=1,
+ conv_channels=None,
+ ctc_loss_reduction="sum",
+ ctc_zero_infinity=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.intermediate_size = intermediate_size
+ self.num_attention_heads = num_attention_heads
+ self.attention_head_dim = attention_head_dim
+ self.max_position_embeddings = max_position_embeddings
+ self.layer_norm_eps = layer_norm_eps
+ self.layerdrop = layerdrop
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.pad_token_id = pad_token_id
+ self.bos_token_id = bos_token_id
+ self.eos_token_id = eos_token_id
+ self.conv_glu_dim = conv_glu_dim
+ self.conv_dropout = conv_dropout
+ self.num_conv_layers = num_conv_layers
+ self.input_feat_per_channel = input_feat_per_channel
+ self.input_channels = input_channels
+ self.conv_channels = conv_channels
+ self.ctc_loss_reduction = ctc_loss_reduction
+ self.ctc_zero_infinity = ctc_zero_infinity
+
+ # prevents config testing from failing when exporting to json
+ self.conv_kernel = list(conv_kernel)
+ self.conv_stride = list(conv_stride)
+
+ if len(self.conv_kernel) != self.num_conv_layers:
+ raise ValueError(
+ "Configuration for convolutional module is incorrect. "
+ "It is required that `len(config.conv_kernel)` == `config.num_conv_layers` "
+ f"but is `len(config.conv_kernel) = {len(self.conv_kernel)}`, "
+ f"`config.num_conv_layers = {self.num_conv_layers}`."
+ )
+
+
+__all__ = ["MCTCTConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..966a160f91b07177b16043a00ccd76685926a422
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py
@@ -0,0 +1,291 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Feature extractor class for M-CTC-T
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
+from ....feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ....feature_extraction_utils import BatchFeature
+from ....file_utils import PaddingStrategy, TensorType
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MCTCTFeatureExtractor(SequenceFeatureExtractor):
+ r"""
+ Constructs an M-CTC-T feature extractor.
+
+ This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+ most of the main methods. Users should refer to this superclass for more information regarding those methods. This
+ code has been adapted from Flashlight's C++ code. For more information about the implementation, one can refer to
+ this [notebook](https://colab.research.google.com/drive/1GLtINkkhzms-IsdcGy_-tVCkv0qNF-Gt#scrollTo=pMCRGMmUC_an)
+ that walks the user step-by-step through the implementation.
+
+ Args:
+ feature_size (`int`, defaults to 80):
+ The feature dimension of the extracted features. This is the number of mel-frequency bins.
+ sampling_rate (`int`, defaults to 16000):
+ The sampling rate at which the audio files should be digitized, expressed in hertz (Hz).
+ padding_value (`float`, defaults to 0.0):
+ The value that is used to fill the padding values.
+ hop_length (`int`, defaults to 10):
+ Number of milliseconds between successive windows (often referred to as "shift" in the literature).
+ win_length (`int`, defaults to 25):
+ Length of each window in milliseconds.
+ win_function (`str`, defaults to `"hamming_window"`):
+ Name for the window function used for windowing, must be accessible via `torch.{win_function}`
+ frame_signal_scale (`float`, defaults to 32768.0):
+ Constant multiplied in creating the frames before applying DFT.
+ preemphasis_coeff (`float`, defaults to 0.97):
+ Constant multiplied in applying Pre-emphasis before DFT.
+ mel_floor (`float`, defaults to 1.0):
+ Minimum value of mel frequency banks.
+ normalize_means (`bool`, *optional*, defaults to `True`):
+ Whether or not to zero-mean normalize the extracted features.
+ normalize_vars (`bool`, *optional*, defaults to `True`):
+ Whether or not to unit-variance normalize the extracted features.
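+
+ Example (a minimal sketch using a synthetic waveform; the shape noted assumes the defaults above):
+
+ ```python
+ >>> import numpy as np
+
+ >>> from transformers import MCTCTFeatureExtractor
+
+ >>> feature_extractor = MCTCTFeatureExtractor()
+ >>> waveform = np.random.randn(16000).astype(np.float32)  # one second of noise at 16 kHz
+ >>> features = feature_extractor(waveform, sampling_rate=16000, return_tensors="np")
+ >>> features["input_features"].shape  # (batch, frames, mel bins) -> (1, 98, 80)
+ ```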
+ """
+
+ model_input_names = ["input_features", "attention_mask"]
+
+ def __init__(
+ self,
+ feature_size=80,
+ sampling_rate=16000,
+ padding_value=0.0,
+ hop_length=10,
+ win_length=25,
+ win_function="hamming_window",
+ frame_signal_scale=32768.0,
+ preemphasis_coeff=0.97,
+ mel_floor=1.0,
+ normalize_means=True,
+ normalize_vars=True,
+ return_attention_mask=False,
+ **kwargs,
+ ):
+ super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+
+ self.feature_size = feature_size
+ self.sampling_rate = sampling_rate
+ self.padding_value = padding_value
+ self.hop_length = hop_length
+ self.win_length = win_length
+ self.frame_signal_scale = frame_signal_scale
+ self.preemphasis_coeff = preemphasis_coeff
+ self.mel_floor = mel_floor
+ self.normalize_means = normalize_means
+ self.normalize_vars = normalize_vars
+ self.win_function = win_function
+ self.return_attention_mask = return_attention_mask
+
+ self.sample_size = win_length * sampling_rate // 1000
+ self.sample_stride = hop_length * sampling_rate // 1000
+
+ self.n_fft = optimal_fft_length(self.sample_size)
+ self.n_freqs = (self.n_fft // 2) + 1
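+ # With the defaults (16 kHz, win_length=25 ms, hop_length=10 ms) this yields 400-sample
+ # windows with a 160-sample stride; optimal_fft_length rounds the window up to the next
+ # power of two (512), giving 257 frequency bins.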
+
+ def _extract_mfsc_features(self, one_waveform: np.ndarray) -> np.ndarray:
+ """
+ Extracts MFSC Features for one waveform vector (unbatched). Adapted from Flashlight's C++ MFSC code.
+ """
+ if self.win_function == "hamming_window":
+ window = window_function(window_length=self.sample_size, name=self.win_function, periodic=False)
+ else:
+ window = window_function(window_length=self.sample_size, name=self.win_function)
+
+ fbanks = mel_filter_bank(
+ num_frequency_bins=self.n_freqs,
+ num_mel_filters=self.feature_size,
+ min_frequency=0.0,
+ max_frequency=self.sampling_rate / 2.0,
+ sampling_rate=self.sampling_rate,
+ )
+
+ mfsc_features = spectrogram(
+ one_waveform * self.frame_signal_scale,
+ window=window,
+ frame_length=self.sample_size,
+ hop_length=self.sample_stride,
+ fft_length=self.n_fft,
+ center=False,
+ preemphasis=self.preemphasis_coeff,
+ mel_filters=fbanks,
+ mel_floor=self.mel_floor,
+ log_mel="log",
+ )
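+ # spectrogram() returns (mel bins, num frames); transpose to (num frames, mel bins)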
+ return mfsc_features.T
+
+ def _normalize_one(self, x, input_length, padding_value):
+ # make sure we normalize float32 arrays
+ if self.normalize_means:
+ mean = x[:input_length].mean(axis=0)
+ x = np.subtract(x, mean)
+ if self.normalize_vars:
+ std = x[:input_length].std(axis=0)
+ x = np.divide(x, std)
+
+ if input_length < x.shape[0]:
+ x[input_length:] = padding_value
+
+ # make sure array is in float32
+ x = x.astype(np.float32)
+
+ return x
+
+ def normalize(
+ self, input_features: list[np.ndarray], attention_mask: Optional[np.ndarray] = None
+ ) -> list[np.ndarray]:
+ lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
+ return [self._normalize_one(x, n, self.padding_value) for x, n in zip(input_features, lengths)]
+
+ def __call__(
+ self,
+ raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+ padding: Union[bool, str, PaddingStrategy] = False,
+ max_length: Optional[int] = None,
+ truncation: bool = False,
+ pad_to_multiple_of: Optional[int] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ sampling_rate: Optional[int] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Main method to featurize and prepare one or several sequence(s) for the model. It returns the
+ log-mel spectrogram of the input audio, as implemented in the original Flashlight MFSC feature extraction code.
+
+ Args:
+ raw_speech (`torch.Tensor`, `np.ndarray`, `list[float]`, `list[torch.Tensor]`, `list[np.ndarray]`, `list[list[float]]`):
+ The sequence or batch of sequences to be padded. Each sequence can be a tensor, a numpy array, a list
+ of float values, a list of tensors, a list of numpy arrays or a list of list of float values. Must be
+ mono channel audio, not stereo, i.e. single float per timestep.
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+ Select a strategy to pad the returned sequences (according to the model's padding side and padding
+ index) among:
+
+ - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+ sequence is provided).
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+ acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+ lengths).
+ max_length (`int`, *optional*):
+ Maximum length of the returned list and optionally padding length (see above).
+ truncation (`bool`):
+ Activates truncation to cut input sequences longer than *max_length* to *max_length*.
+ pad_to_multiple_of (`int`, *optional*):
+ If set will pad the sequence to a multiple of the provided value.
+
+ This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+ `>= 7.0` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
+ return_attention_mask (`bool`, *optional*):
+ Whether to return the attention mask. If left to the default, will return the attention mask according
+ to the specific feature_extractor's default.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+ If set, will return tensors instead of list of python integers. Acceptable values are:
+
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+ sampling_rate (`int`, *optional*):
+ The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+ `sampling_rate` at the forward call to prevent silent errors.
+ padding_value (`float`, defaults to 0.0):
+ The value that is used to fill the padding values.
+ """
+
+ if sampling_rate is not None:
+ if sampling_rate != self.sampling_rate:
+ raise ValueError(
+ f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+ f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
+ f" {self.sampling_rate} and not {sampling_rate}."
+ )
+ else:
+ logger.warning(
+ "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
+ "Failing to do so can result in silent errors that might be hard to debug."
+ )
+
+ is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+ if is_batched_numpy and len(raw_speech.shape) > 2:
+ raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+ is_batched = is_batched_numpy or (
+ isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
+ )
+
+ if is_batched:
+ raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
+ elif not is_batched and not isinstance(raw_speech, np.ndarray):
+ raw_speech = np.asarray(raw_speech, dtype=np.float32)
+ elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+ raw_speech = raw_speech.astype(np.float32)
+
+ # always return batch
+ if not is_batched:
+ raw_speech = [raw_speech]
+
+ # extract fbank features
+ features = [self._extract_mfsc_features(one_waveform) for one_waveform in raw_speech]
+
+ # convert into correct format for padding
+ encoded_inputs = BatchFeature({"input_features": features})
+
+ padded_inputs = self.pad(
+ encoded_inputs,
+ padding=padding,
+ max_length=max_length,
+ truncation=truncation,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_attention_mask=True,
+ **kwargs,
+ )
+ # make sure list is in array format
+ input_features = padded_inputs.get("input_features")
+ if isinstance(input_features[0], list):
+ padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
+
+ attention_mask = padded_inputs.get("attention_mask")
+ if attention_mask is not None:
+ padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
+
+ if self.normalize_means or self.normalize_vars:
+ attention_mask = (
+ np.array(attention_mask, dtype=np.int32)
+ if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
+ and padding
+ else None
+ )
+ padded_inputs["input_features"] = self.normalize(
+ padded_inputs["input_features"], attention_mask=attention_mask
+ )
+
+ if return_tensors is not None:
+ padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+ return padded_inputs
+
+
+__all__ = ["MCTCTFeatureExtractor"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..16f59d3d1dfa11a11bd1ef51ad77e137adf2cc93
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py
@@ -0,0 +1,778 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch M-CTC-T model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ....integrations.deepspeed import is_deepspeed_zero3_enabled
+from ....integrations.fsdp import is_fsdp_managed_module
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput, CausalLMOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import logging
+from .configuration_mctct import MCTCTConfig
+
+
+logger = logging.get_logger(__name__)
+
+_HIDDEN_STATES_START_POSITION = 1
+
+_CONFIG_FOR_DOC = "MCTCTConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "speechbrain/m-ctc-t-large"
+_EXPECTED_OUTPUT_SHAPE = [1, 195, 1536]
+
+# CTC docstring
+_CTC_EXPECTED_OUTPUT = '"Mr. Quilter is the apostle of the middle classes, and we\'re glad to welcome his gospel."'
+_CTC_EXPECTED_LOSS = 1885.65
+
+
+class MCTCTConv1dSubsampler(nn.Module):
+ """
+ Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
+ via gated linear units (https://huggingface.co/papers/1911.08460)
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.glu_dim = config.conv_glu_dim
+
+ self.dropout = nn.Dropout(config.conv_dropout)
+
+ self.num_layers = config.num_conv_layers
+ self.in_channels = config.input_feat_per_channel * config.input_channels
+
+ if self.num_layers > 1:
+ if config.conv_channels is None:
+ raise ValueError(
+ "Need to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution"
+ " layers."
+ )
+
+ self.mid_channels = config.conv_channels
+ else:
+ self.mid_channels = None
+
+ self.out_channels = config.hidden_size * 2 # considering GLU halving
+ self.kernel_size = config.conv_kernel
+ self.stride = config.conv_stride
+
+ # NOTE: MCTCT by construction only uses one convolution kernel. I've made this flexible to allow for
+ # multiple layers of convolutions, but not sure if this model definition should just restrict it
+ # to one layer. This becomes especially relevant when considering the padding computed in the first line of forward().
+ self.conv_layers = nn.ModuleList(
+ nn.Conv1d(
+ self.in_channels if i == 0 else self.mid_channels[i],
+ self.mid_channels[i] if i < self.num_layers - 1 else self.out_channels,
+ kernel_size=k,
+ stride=self.stride[i],
+ padding="valid",
+ )
+ for i, k in enumerate(self.kernel_size)
+ )
+
+ def forward(self, input_features):
+ # NOTE: in reference to the NOTE in __init__, right now it just calculates padding as if
+ # there will be just one conv layer.
+ padding = sum(size // 2 for size in self.kernel_size) # (7, 7) -> (3, 3)
+
+ input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0)
+ hidden_states = input_features.transpose(1, 2).contiguous() # -> Batch x Frame x Time
+ for conv in self.conv_layers:
+ hidden_states = conv(hidden_states)
+ hidden_states = nn.functional.glu(hidden_states, dim=self.glu_dim)
+ hidden_states = self.dropout(hidden_states)
+
+ hidden_states = hidden_states.transpose(1, 2).contiguous() # -> Batch x Time x Frame
+ return hidden_states
+
+
+class MCTCTEmbeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.LayerNorm = MCTCTLayerNorm()
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+ )
+ self.register_buffer(
+ "token_type_ids",
+ torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
+ persistent=False,
+ )
+
+ def forward(
+ self, input_features=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+ ):
+ input_shape = input_features.size() if input_features is not None else inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+ # Set token_type_ids to the registered buffer from the constructor, where it is all zeros. This usually
+ # happens when token_type_ids is auto-generated; the registered buffer helps users trace the model without
+ # passing token_type_ids, and solves issue #5664.
+ if token_type_ids is None:
+ if hasattr(self, "token_type_ids"):
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_features)
+
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + token_type_embeddings
+
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class MCTCTSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = config.attention_head_dim
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+ self.is_decoder = config.is_decoder
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def reshape_fortran(self, x, shape):
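+ # Implements a column-major (Fortran-order) reshape, equivalent to NumPy's
+ # np.reshape(x, shape, order="F"): reverse the axes, do a row-major reshape
+ # against the reversed shape, then reverse the axes back.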
+ if len(x.shape) > 0:
+ x = x.permute(*reversed(range(len(x.shape))))
+ return x.reshape(*reversed(shape)).permute(*reversed(range(len(shape))))
+
+ def relative_position_embedding_rotate(self, scores):
+ # NOTE: should re-evaluate whether this re-implementation was truly necessary
+ # or whether my complete overhaul worked because of some other part
+ # of the code. Adding this and the Fortran-reshape code seems very undesirable.
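+ # The pad/flatten/trim/reshape sequence below is (a Fortran-order variant of) the standard
+ # relative-attention "skewing" trick: it shifts each row so every query position reads its
+ # scores at the correct relative offsets.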
+ scores = scores.permute(0, 2, 3, 1) # e.g. [10, 1839, 14, 4]
+
+ batch, hidden_state, seq_len, heads = scores.shape
+
+ # e.g. [10, 1853, 14, 4]
+ scores = torch.cat((scores, torch.zeros((batch, seq_len, seq_len, heads), device=scores.device)), dim=1)
+
+ # e.g. [10, 25942, 1, 4]
+ scores = self.reshape_fortran(scores, [batch, (hidden_state + seq_len) * seq_len, 1, heads])
+
+ # e.g. [10, 25928, 1, 4]
+ scores = scores[:, : (seq_len + hidden_state - 1) * seq_len]
+
+ # e.g. [10, 1852, 14, 4]
+ scores = self.reshape_fortran(scores, [batch, hidden_state + seq_len - 1, seq_len, heads])
+
+ halfpoint = hidden_state // 2
+ scores = scores[:, halfpoint : halfpoint + seq_len].transpose(1, 2) # e.g. [10, 14, 14, 4]
+
+ return scores.permute(0, 3, 1, 2)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ output_attentions=False,
+ ):
+ mixed_query_layer = self.query(hidden_states)
+ mixed_query_layer = mixed_query_layer / math.sqrt(self.attention_head_size)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ # relative key position embeddings
+ positional_embedding = self.distance_embedding.weight
+ relative_position_scores = torch.einsum("lh, bche -> bcle", positional_embedding, query_layer.transpose(2, 3))
+
+ relative_position_scores = self.relative_position_embedding_rotate(relative_position_scores)
+ attention_scores = attention_scores + relative_position_scores
+
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in the MCTCTModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).flatten(start_dim=-2)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class MCTCTLayerNorm(nn.Module):
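+ # Note: despite its name, this module does not normalize statistics; it only applies a
+ # single learned scalar weight and bias to the hidden states.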
+ def __init__(self):
+ super().__init__()
+ self.singleton_weight = nn.Parameter(torch.ones(1))
+ self.singleton_bias = nn.Parameter(torch.zeros(1))
+
+ def forward(self, hidden_states):
+ return (hidden_states * self.singleton_weight) + self.singleton_bias
+
+
+class MCTCTSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class MCTCTAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.self = MCTCTSelfAttention(config)
+ self.output = MCTCTSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ output_attentions=False,
+ ):
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+
+ return outputs
+
+
+class MCTCTIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class MCTCTOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class MCTCTLayer(GradientCheckpointingLayer):
+ def __init__(self, config: MCTCTConfig):
+ super().__init__()
+
+ self.seq_len_dim = 1
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+
+ self.intermediate = MCTCTIntermediate(config)
+ self.attention = MCTCTAttention(config)
+ self.is_decoder = config.is_decoder
+ self.output = MCTCTOutput(config)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ output_attentions=False,
+ ):
+ self_attention_outputs = self.attention(
+ hidden_states, attention_mask, head_mask, output_attentions=output_attentions
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+ )
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+
+class MCTCTPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: MCTCTConfig
+ base_model_prefix = "mctct"
+ main_input_name = "input_features"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, MCTCTLayerNorm):
+ module.singleton_weight.data.fill_(1.0)
+ module.singleton_bias.data.zero_()
+ if isinstance(module, (nn.Linear, nn.Conv1d)):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+
+ def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+ """
+ Computes the output length of the convolutional layers
+ """
+ dilation = 1
+ for _, kernel_sz, stride in zip(
+ range(self.config.num_conv_layers), self.config.conv_kernel, self.config.conv_stride
+ ):
+ padding = kernel_sz // 2
+ input_lengths = input_lengths + 2 * padding - dilation * (kernel_sz - 1) - 1
+ input_lengths = torch.div(input_lengths, stride, rounding_mode="trunc") + 1
+
+ return input_lengths
+
+ def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
+ # generate() creates a 3D attention mask because of the shape of input_features;
+ # convert it to 2D if that's the case
+ if len(attention_mask.shape) > 2:
+ attention_mask = attention_mask[:, :, -1]
+
+ # subsampled_lengths = attention_mask.sum(-1)
+ subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+ bsz = attention_mask.size()[0]
+ attention_mask = torch.zeros(
+ (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+ )
+
+ # these two operations make sure that all values
+ # before the output length indices are attended to
+ attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
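+ # e.g. a subsampled length of 3 in 5 slots: the one-hot mask [0, 0, 1, 0, 0]
+ # becomes [1, 1, 1, 0, 0] after the flip -> cumsum -> flip above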
+ return attention_mask
+
+
+MCTCT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+MCTCT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_features (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`Wav2Vec2CTCTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class MCTCTEncoder(MCTCTPreTrainedModel):
+ def __init__(self, config: MCTCTConfig):
+ super().__init__(config)
+ self.hidden_dropout_prob = config.hidden_dropout_prob
+
+ self.layer_norm = MCTCTLayerNorm()
+ self.conv = MCTCTConv1dSubsampler(config)
+ self.layers = nn.ModuleList([MCTCTLayer(config) for _ in range(config.num_hidden_layers)])
+
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ input_features: torch.Tensor,
+ attention_mask: torch.Tensor,
+ head_mask: torch.Tensor,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ ) -> Union[tuple, BaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ input_features = self.layer_norm(input_features)
+
+ inputs_embeds = self.conv(input_features)
+
+ # subsample attention mask if necessary
+ if attention_mask is not None:
+ attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
+
+ hidden_states = nn.functional.dropout(inputs_embeds, p=self.hidden_dropout_prob, training=self.training)
+
+ # expand attention_mask
+ if attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+ encoder_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ # check if head_mask has a correct number of layers specified if desired
+ if head_mask is not None:
+ if head_mask.size()[0] != len(self.layers):
+ raise ValueError(
+ f"The head_mask should be specified for {len(self.layers)} layers, "
+ f"but it is for {head_mask.size()[0]}."
+ )
+
+ synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+ dropout_probability = torch.rand([])
+
+ skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+ if not skip_the_layer or synced_gpus:
+ # under fsdp or deepspeed zero3 all gpus must run in sync
+ layer_outputs = encoder_layer(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if skip_the_layer:
+ layer_outputs = (None, None)
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_states = encoder_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+ )
+
+
+@add_start_docstrings(
+ "The bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.",
+ MCTCT_START_DOCSTRING,
+)
+class MCTCTModel(MCTCTPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ self.encoder = MCTCTEncoder(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="audio",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ input_features: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, BaseModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_features is None:
+ raise ValueError("You have to specify input_features.")
+
+ encoder_outputs = self.encoder(
+ input_features,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+
+ if not return_dict:
+ return (sequence_output,) + encoder_outputs[1:]
+
+ return BaseModelOutput(
+ last_hidden_state=sequence_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """MCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
+ MCTCT_START_DOCSTRING,
+)
+class MCTCTForCTC(MCTCTPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.mctct = MCTCTModel(config)
+
+ if config.vocab_size is None:
+ raise ValueError(
+ f"You are trying to instantiate {self.__class__} with a configuration that "
+ "does not define the vocabulary size of the language model head. Please "
+ "instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+ "or define `vocab_size` of your model's configuration."
+ )
+ output_hidden_size = config.hidden_size
+
+ self.ctc_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=CausalLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_CTC_EXPECTED_OUTPUT,
+ expected_loss=_CTC_EXPECTED_LOSS,
+ )
+ def forward(
+ self,
+ input_features: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: Optional[torch.LongTensor] = None,
+ ) -> Union[tuple, CausalLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+ config.vocab_size - 1]`.
+ """
+ if labels is not None and labels.max() >= self.config.vocab_size:
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ outputs = self.mctct(
+ input_features,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
+
+ logits = self.ctc_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # retrieve loss input_lengths from attention_mask
+ attention_mask = (
+ attention_mask
+ if attention_mask is not None
+ else torch.ones(input_features.shape[:-1], dtype=torch.long)
+ )
+ input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+ # assuming that padded tokens are filled with -100
+ # when not being attended to
+ labels_mask = labels >= 0
+ target_lengths = labels_mask.sum(-1)
+ flattened_targets = labels.masked_select(labels_mask)
+
+ # ctc_loss doesn't support fp16
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
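+ # nn.functional.ctc_loss expects log-probabilities shaped (time, batch, vocab),
+ # hence the transpose(0, 1) above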
+
+ with torch.backends.cudnn.flags(enabled=False):
+ loss = nn.functional.ctc_loss(
+ log_probs,
+ flattened_targets,
+ input_lengths,
+ target_lengths,
+ blank=self.config.pad_token_id,
+ reduction=self.config.ctc_loss_reduction,
+ zero_infinity=self.config.ctc_zero_infinity,
+ )
+
+ if not return_dict:
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+ return ((loss,) + output) if loss is not None else output
+
+ return CausalLMOutput(
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+ )
+
+
+__all__ = ["MCTCTForCTC", "MCTCTModel", "MCTCTPreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/processing_mctct.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/processing_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..9661a5082d4838b4e60042be2ceb747c03e045ce
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mctct/processing_mctct.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for M-CTC-T
+"""
+
+import warnings
+from contextlib import contextmanager
+
+from ....processing_utils import ProcessorMixin
+
+
+class MCTCTProcessor(ProcessorMixin):
+ r"""
+ Constructs an MCTCT processor which wraps an MCTCT feature extractor and an MCTCT tokenizer into a single processor.
+
+ [`MCTCTProcessor`] offers all the functionalities of [`MCTCTFeatureExtractor`] and [`AutoTokenizer`]. See the
+ [`~MCTCTProcessor.__call__`] and [`~MCTCTProcessor.decode`] for more information.
+
+ Args:
+ feature_extractor (`MCTCTFeatureExtractor`):
+ An instance of [`MCTCTFeatureExtractor`]. The feature extractor is a required input.
+ tokenizer (`AutoTokenizer`):
+ An instance of [`AutoTokenizer`]. The tokenizer is a required input.
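+
+ Example (a minimal sketch; assumes the deprecated checkpoint still hosts both the feature
+ extractor and tokenizer files):
+
+ ```python
+ >>> import numpy as np
+
+ >>> from transformers import MCTCTProcessor
+
+ >>> processor = MCTCTProcessor.from_pretrained("speechbrain/m-ctc-t-large")
+ >>> audio = np.random.randn(16000).astype(np.float32)  # placeholder one-second waveform
+ >>> inputs = processor(audio=audio, sampling_rate=16000, return_tensors="pt")
+ ```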
+ """
+
+ feature_extractor_class = "MCTCTFeatureExtractor"
+ tokenizer_class = "AutoTokenizer"
+
+ def __init__(self, feature_extractor, tokenizer):
+ super().__init__(feature_extractor, tokenizer)
+ self.current_processor = self.feature_extractor
+ self._in_target_context_manager = False
+
+ def __call__(self, *args, **kwargs):
+ """
+ When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
+ [`~MCTCTFeatureExtractor.__call__`] and returns its output. If used within the
+ [`~MCTCTProcessor.as_target_processor`] context manager, this method forwards all its arguments to AutoTokenizer's
+ [`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
+ """
+ # For backward compatibility
+ if self._in_target_context_manager:
+ return self.current_processor(*args, **kwargs)
+
+ if "raw_speech" in kwargs:
+ warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+ audio = kwargs.pop("raw_speech")
+ else:
+ audio = kwargs.pop("audio", None)
+ sampling_rate = kwargs.pop("sampling_rate", None)
+ text = kwargs.pop("text", None)
+ if len(args) > 0:
+ audio = args[0]
+ args = args[1:]
+
+ if audio is None and text is None:
+ raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+ if audio is not None:
+ inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
+ if text is not None:
+ encodings = self.tokenizer(text, **kwargs)
+
+ if text is None:
+ return inputs
+ elif audio is None:
+ return encodings
+ else:
+ inputs["labels"] = encodings["input_ids"]
+ return inputs
+
+ def pad(self, *args, **kwargs):
+ """
+ When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
+ [`~MCTCTFeatureExtractor.pad`] and returns its output. If used within the
+ [`~MCTCTProcessor.as_target_processor`] context manager, this method forwards all its arguments to PreTrainedTokenizer's
+ [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
+ """
+ # For backward compatibility
+ if self._in_target_context_manager:
+ return self.current_processor.pad(*args, **kwargs)
+
+ input_features = kwargs.pop("input_features", None)
+ labels = kwargs.pop("labels", None)
+ if len(args) > 0:
+ input_features = args[0]
+ args = args[1:]
+
+ if input_features is not None:
+ input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
+ if labels is not None:
+ labels = self.tokenizer.pad(labels, **kwargs)
+
+ if labels is None:
+ return input_features
+ elif input_features is None:
+ return labels
+ else:
+ input_features["labels"] = labels["input_ids"]
+ return input_features
+
+ def decode(self, *args, **kwargs):
+ """
+ This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
+ docstring of this method for more information.
+ """
+ return self.tokenizer.decode(*args, **kwargs)
+
+ @contextmanager
+ def as_target_processor(self):
+ """
+ Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT.
+ """
+ warnings.warn(
+ "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+ "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+ "your audio inputs, or in a separate call."
+ )
+ self._in_target_context_manager = True
+ self.current_processor = self.tokenizer
+ yield
+ self.current_processor = self.feature_extractor
+ self._in_target_context_manager = False
+
+
+__all__ = ["MCTCTProcessor"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cff2c19505f9306c28edb5bdabb66f668363f5fa
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_mega import *
+ from .modeling_mega import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c5b16c77ed153134535e843dbe2f0e2f283ddb1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..50090bfd036c1ea903645d64d45babde47f086ed
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/__pycache__/configuration_mega.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/configuration_mega.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/configuration_mega.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ede2cac66bf292c448b488d18af4d2e6e2c716a
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/configuration_mega.py
@@ -0,0 +1,243 @@
+# coding=utf-8
+# Copyright 2023 The Mega Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MEGA configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ....configuration_utils import PretrainedConfig
+from ....onnx import OnnxConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MegaConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`MegaModel`]. It is used to instantiate a Mega
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the Mega
+ [mnaylor/mega-base-wikitext](https://huggingface.co/mnaylor/mega-base-wikitext) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the Mega model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`MegaModel`].
+ hidden_size (`int`, *optional*, defaults to 128):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 4):
+ Number of hidden layers in the Mega encoder.
+ intermediate_size (`int`, *optional*, defaults to 256):
+ Dimensionality of the hidden size (self-attention value projection) within the Mega encoder
+ ema_projection_size (`int`, *optional*, defaults to 16):
+ Dimensionality of the MegaMultiDimensionDampedEma
+ bidirectional (`bool`, *optional*, defaults to `True`):
+ Whether the MegaMultiDimensionDampedEma used in Mega's self-attention should work bidirectionally (`True`)
+ or unidirectionally (`False`). Bidirectional EMA is incompatible with causal decoding, so this should be
+ False if you intend to use the model as a decoder.
+ shared_representation_size (`int`, *optional*, defaults to 64):
+ Dimensionality of the linear projection for shared representation of self-attention queries and keys
+ use_chunking (`bool`, *optional*, defaults to `False`):
+ Whether to chunk inputs for linear self-attention complexity (described as Mega-chunk in the paper)
+ chunk_size (`int`, *optional*, defaults to -1):
+ If `use_chunking` is set to `True`, determines the size of the chunks to apply to the input sequence. If
+ chunking is used, input sequences must be padded to a multiple of `chunk_size`
+ truncation (`int`, *optional*):
+ If specified, the sequence length for which to truncate MegaMultiDimensionDampedEma
+ normalize_before_mega (`bool`, *optional*, defaults to `True`):
+ Whether to normalize before (`True`) or after (`False`) passing through Mega encoder blocks
+ normalization_type (`str`, *optional*, defaults to `"scalenorm"`):
+ Type of normalization to use in Mega encoder blocks. Choose one of `"scalenorm"`, `"layernorm"`,
+ `"rmsnorm"`, `"batchnorm"`, or `"syncbatchnorm"` (GPU required for syncbatchnorm)
+ norm_affine (`bool`, *optional*, defaults to `True`):
+ If `True`, applies a parameterized affine transformation to inputs during normalization
+ activation (`str`, *optional*, defaults to `"silu"`):
+ Activation function to apply within Mega encoder blocks. Choose one of `"silu"`, `"relu"`, `"linear"`,
+ `"gelu"`, or `"gelu_accurate"`
+ attention_activation (`str`, *optional*, defaults to `"softmax"`):
+ Activation function to apply for single-headed self-attention (a la Transformer). Choose one of
+ `"softmax"`, `"laplace"`, or `"relu2"`
+ dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for EMA self-attention
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ use_feature_dropout (`bool`, *optional*, defaults to `False`):
+ Whether to use feature-based (`True`) or standard dropout (`False`)
+ use_normalized_ffn (`bool`, *optional*, defaults to `True`):
+ Whether to use the normalized feed-forward sub-layer in Mega blocks (`True`) or pass Mega encoder output
+ as-is (`False`)
+ nffn_hidden_size (`int`, *optional*, defaults to 256):
+ If using the normalized feed-forward network (NFFN) layer within Mega (`use_normalized_ffn = True`), this
+ is the hidden size of the NFFN
+ normalize_before_ffn (`bool`, *optional*, defaults to `True`):
+ Whether to normalize before (`True`) or after (`False`) the feed-forward portion of NFFN
+ nffn_activation_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the NFFN component.
+ max_positions (`int`, *optional*, defaults to 2048):
+ The maximum sequence length to use for positional representations. For `"simple"` relative positional bias,
+ this is a hard limit on input length; `"rotary"` relative positional bias will extrapolate to longer
+ sequences
+ add_token_type_embeddings (`bool`, *optional*, defaults to `False`):
+ Whether to account for token types in embeddings. Left as optional to maintain compatibility with original
+ implementation while adding support for token types.
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`MegaModel`]. Only used if
+ `add_token_type_embeddings = True`
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ ema_delta_alpha_range (`float`, *optional*, defaults to 0.2):
+ The standard deviation for initializing the delta (damping factor) and alpha (decay factor) parameters in
+ MegaMultiDimensionDampedEma.
+ ema_beta_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation for initializing the beta parameter (expansion matrix) in
+ MegaMultiDimensionDampedEma.
+ ema_gamma_omega_range (`float`, *optional*, defaults to 1.0):
+ The standard deviation for initializing the gamma (projection matrix) and omega (residual weight)
+ parameters in MegaMultiDimensionDampedEma.
+ relative_positional_bias (`str`, *optional*, defaults to `"rotary"`):
+ Type of relative positional encoding. Choose one of `"rotary"` or `"simple"`. If `"simple"` is selected,
+ `max_positions` is used as a limit on input size, while `"rotary"` extrapolates beyond `max_positions`.
+ is_decoder (`bool`, *optional*, defaults to `False`):
+ Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ classifier_dropout (`float`, *optional*):
+ The dropout ratio for the classification head.
+ add_lm_hidden_dense_layer (`bool`, *optional*, defaults to `True`):
+ Whether to include a hidden layer for projection between encoder outputs and LM heads (`True`) or pass
+ hidden states directly to LM head (`False`). Remains optional for compatibility with original
+ implementation
+
+ Examples:
+
+ ```python
+ >>> from transformers import MegaConfig, MegaModel
+
+ >>> # Initializing a Mega configuration
+ >>> configuration = MegaConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = MegaModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "mega"
+
+ def __init__(
+ self,
+ vocab_size=30522,
+ hidden_size=128,
+ num_hidden_layers=4,
+ intermediate_size=256,
+ ema_projection_size=16,
+ bidirectional=True,
+ shared_representation_size=64,
+ use_chunking=False,
+ chunk_size=-1,
+ truncation=None,
+ normalize_before_mega=True,
+ normalization_type="scalenorm",
+ norm_affine=True,
+ activation="silu",
+ attention_activation="softmax",
+ dropout_prob=0.1,
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ use_feature_dropout=False,
+ use_normalized_ffn=True,
+ nffn_hidden_size=256,
+ normalize_before_ffn=True,
+ nffn_activation_dropout_prob=0.1,
+ max_positions=2048,
+ add_token_type_embeddings=False,
+ type_vocab_size=2,
+ initializer_range=0.02,
+ ema_delta_alpha_range=0.2,
+ ema_beta_range=0.02,
+ ema_gamma_omega_range=1.0,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ relative_positional_bias="rotary",
+ classifier_dropout=None,
+ use_cache=True,
+ add_lm_hidden_dense_layer=True,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.activation = activation
+ self.attention_activation = attention_activation
+ self.intermediate_size = intermediate_size
+ self.ema_projection_size = ema_projection_size
+ self.bidirectional = bidirectional
+ self.shared_representation_size = shared_representation_size
+ self.use_chunking = use_chunking
+ self.chunk_size = chunk_size
+ self.truncation = truncation
+ self.normalize_before_mega = normalize_before_mega
+ self.normalization_type = normalization_type
+ self.norm_affine = norm_affine
+ self.dropout_prob = dropout_prob
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.use_feature_dropout = use_feature_dropout
+ self.use_normalized_ffn = use_normalized_ffn
+ self.nffn_hidden_size = nffn_hidden_size
+ self.normalize_before_ffn = normalize_before_ffn
+ self.nffn_activation_dropout_prob = nffn_activation_dropout_prob
+ self.max_positions = max_positions
+ self.add_token_type_embeddings = add_token_type_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.initializer_range = initializer_range
+ self.ema_delta_alpha_range = ema_delta_alpha_range
+ self.ema_beta_range = ema_beta_range
+ self.ema_gamma_omega_range = ema_gamma_omega_range
+ self.relative_positional_bias = relative_positional_bias
+ self.use_cache = use_cache
+ self.classifier_dropout = classifier_dropout
+ self.add_lm_hidden_dense_layer = add_lm_hidden_dense_layer
+ self.num_attention_heads = 1 # not used but required by Hugging Face
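+
+ # Illustrative sketch: per the `bidirectional` docs above, a causal decoder
+ # configuration should disable bidirectional EMA, e.g.
+ #   decoder_config = MegaConfig(is_decoder=True, bidirectional=False, use_cache=True)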
+
+
+class MegaOnnxConfig(OnnxConfig):
+ @property
+ def inputs(self) -> Mapping[str, Mapping[int, str]]:
+ if self.task == "multiple-choice":
+ dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+ else:
+ dynamic_axis = {0: "batch", 1: "sequence"}
+ return OrderedDict(
+ [
+ ("input_ids", dynamic_axis),
+ ("attention_mask", dynamic_axis),
+ ]
+ )
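+
+ # Illustrative sketch (mirroring how other OnnxConfig subclasses are used):
+ #   onnx_config = MegaOnnxConfig(MegaConfig())
+ #   dynamic_axes = onnx_config.inputs  # axes for "input_ids" and "attention_mask"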
+
+
+__all__ = ["MegaConfig", "MegaOnnxConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/modeling_mega.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/modeling_mega.py
new file mode 100644
index 0000000000000000000000000000000000000000..c237afee9a334d57d155b8efa2eb5de8278d21dd
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mega/modeling_mega.py
@@ -0,0 +1,2276 @@
+# coding=utf-8
+# Copyright 2023 The Mega Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch MEGA model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_outputs import (
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_mega import MegaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "mnaylor/mega-base-wikitext"
+_CONFIG_FOR_DOC = "MegaConfig"
+
+
+class MegaEmbeddings(nn.Module):
+ """
+ Mega's basic implementation does not incorporate token type embeddings, so this is a stripped-down version of
+ RoBERTa's embeddings which optionally includes token types
+ """
+
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.use_token_types = config.add_token_type_embeddings
+ if self.use_token_types:
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+ # registering a buffer here allows model tracing when not passing optional token type IDs
+ # more info at transformers issue #5664
+ self.register_buffer(
+ "token_type_ids", torch.zeros(config.max_positions, dtype=torch.long).expand((1, -1)), persistent=False
+ )
+
+ self.padding_idx = config.pad_token_id
+
+ def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None):
+ if (input_ids is None) and (inputs_embeds is None):
+ raise ValueError("Must provide one of input_ids or inputs_embeds")
+ elif input_ids is not None:
+ input_shape = input_ids.size()
+ device = input_ids.device
+
+ # get the word embeddings if only IDs are provided
+ inputs_embeds = self.word_embeddings(input_ids)
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+ device = inputs_embeds.device
+
+ # the original Mega implementation did not include token type embeddings, so we add
+ # an option to use them if desired; if embeddings are present and token type IDs are
+ # not provided, we will use a registered buffer (which helps with tracing)
+ if self.use_token_types:
+ if token_type_ids is None:
+ if hasattr(self, "token_type_ids"):
+ buffered_token_type_ids = self.token_type_ids[:, : input_shape[1]]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], input_shape[1])
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ # access token type embeddings
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+ # add the token type embeddings to the word embeddings
+ embeddings = inputs_embeds + token_type_embeddings
+ else:
+ embeddings = inputs_embeds
+ return embeddings
+
+
+class MegaSimpleRelativePositionalBias(nn.Module):
+ """
+ Simple relative positional embeddings copied from the Mega repo; renamed variables for better readability
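+
+ The tiling in `forward` builds a Toeplitz-style matrix: roughly, entry (i, j) of the returned
+ (seq_len x seq_len) tensor is the learned scalar bias for relative offset i - j.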
+ """
+
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+ self.config = config
+ self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
+ self.rel_pos_bias = nn.Parameter(torch.Tensor(2 * config.max_positions - 1))
+
+ def forward(self, seq_len):
+ if seq_len > self.max_positions:
+ raise ValueError(f"Sequence length {seq_len} going beyond max length {self.max_positions}")
+
+ # seq_len * 2 - 1
+ bias = self.rel_pos_bias[(self.max_positions - seq_len) : (self.max_positions + seq_len - 1)]
+ # seq_len * 3 - 1
+ tile = F.pad(bias, (0, seq_len))
+ # (seq_len * 3 - 1) * seq_len
+ tile = torch.tile(tile, (seq_len,))
+ tile = tile[:-seq_len]
+ # seq_len x (3 * seq_len - 2)
+ tile = tile.view(seq_len, 3 * seq_len - 2)
+ start = (2 * seq_len - 1) // 2
+ end = tile.size(1) - start
+ tile = tile[:, start:end]
+ return tile
+
+
+class MegaRotaryRelativePositionalBias(nn.Module):
+ """
+ Rotary relative bias for positional information; similar in concept to RoPE (i.e. RoFormer) but taken from the Mega
+ repo due to differences in implementation.
+
+ When initialized, produces a positional bias which ranges from position 0 to config.max_positions, but can
+ extrapolate to longer sequences. Can be indexed according to input position IDs
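+
+ Sketch of the mechanism (see `forward`): two learned vectors are rotated by position-dependent
+ sinusoids, and the bias for a position pair (m, n) is their inner product, which depends only
+ on the offset m - n.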
+ """
+
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+ if config.hidden_size % 2 != 0:
+ raise RuntimeError("Rotary positional bias requires `hidden_size` to be a multiple of 2")
+ self.config = config
+ self.embed_dim = config.shared_representation_size
+ self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
+ self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(
+ config.max_positions, self.embed_dim
+ )
+ # alpha and beta parameters for the rotary bias; beta renamed to b_param to avoid clashes with tf/flax weight handling
+ # in loading pretrained weights
+ self.alpha = nn.Parameter(torch.Tensor(1, self.embed_dim))
+ self.b_param = nn.Parameter(torch.Tensor(1, self.embed_dim))
+ self.register_buffer("_float_tensor", torch.FloatTensor([0.0]))
+
+ @staticmethod
+ def get_sinusoid_embeddings(max_positions: int, embedding_dim: int):
+ half_dim = embedding_dim // 2
+ emb = math.log(10000) / half_dim
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+ emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+ return torch.sin(emb), torch.cos(emb)
+
+ def rotary(self, input):
+ seq_len, embed_dim = input.size()
+ chunk_1, chunk_2 = torch.chunk(input, 2, dim=-1)
+ if self.sine is None or seq_len > self.sine.size(0):
+ self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(seq_len, embed_dim)
+ self.max_positions = seq_len
+ self.sine = self.sine.to(self._float_tensor)
+ self.cosine = self.cosine.to(self._float_tensor)
+
+ sin = self.sine[:seq_len]
+ cos = self.cosine[:seq_len]
+ return torch.cat([chunk_1 * cos - chunk_2 * sin, chunk_2 * cos + chunk_1 * sin], dim=1)
+
+ def forward(self, seq_len):
+ rotary_alpha = self.rotary(self.alpha.expand(seq_len, self.embed_dim))
+ rotary_beta = self.rotary(self.b_param.expand(seq_len, self.embed_dim))
+ bias = torch.einsum("mk,nk->mn", rotary_alpha, rotary_beta)
+ return bias
+
+
+class MegaDropout(nn.Module):
+ """
+ A unified class for standard dropout functionality and featurewise dropout.
+
+ The original fairseq Mega repo used 2 classes for these, which included some unnecessary handling of training logic
+ and an unused `inplace` option. The original implementation used torch.nn.functional instead of submodules, which
+ is retained here as well.
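+
+ Note that featurewise dropout applies `F.dropout2d` to a (batch, feature, sequence) view, so it
+ zeroes entire feature channels across all timesteps rather than independent elements.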
+ """
+
+ def __init__(self, dropout_probability, is_featurewise=False):
+ super().__init__()
+ self.dropout_probability = dropout_probability
+ self.is_featurewise = is_featurewise
+
+ def forward(self, input, batch_first: bool = False):
+ if self.is_featurewise:
+ if batch_first:
+ # (batch_size X sequence_length X feature_dimension)
+ # -> (batch_size X feature_dimension X sequence_length)
+ # -> (batch_size X sequence_length X feature_dimension)
+ return F.dropout2d(
+ input.transpose(-1, -2), p=self.dropout_probability, training=self.training
+ ).transpose(-1, -2)
+ else:
+ if input.dim() != 3:
+ raise ValueError(
+ "Feature dropout inputs must be exactly 3-dimensional if inputs are ordered [sequence length, batch size, hidden dimension]"
+ )
+ # (sequence_length X batch_size X feature_dimension)
+ # -> (batch_size X feature_dimension X sequence_length)
+ # -> (sequence_length X batch_size X feature_dimension)
+ return F.dropout2d(input.permute(1, 2, 0), p=self.dropout_probability, training=self.training).permute(
+ 2, 0, 1
+ )
+ else:
+ return F.dropout(input, p=self.dropout_probability, training=self.training)
+
+
+class MegaRMSNorm(nn.Module):
+ """
+ RMSNorm used in Mega implementation. Differs from T5's RMSNorm by applying the weight prior to taking the square
+ root (as opposed to after in T5)
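+
+ In effect (sketch): `output = (input * weight) * rsqrt(mean(input**2, dim=-1) + eps)`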
+ """
+
+ def __init__(self, number_features, eps=1e-6, affine=True):
+ super().__init__()
+ self.num_features = number_features
+ self.eps = eps
+ self.affine = affine
+ if affine:
+ self.weight = nn.Parameter(torch.Tensor(self.num_features))
+ else:
+ self.register_parameter("weight", None)
+
+ def forward(self, input):
+ mean_square = torch.mean(torch.square(input), dim=-1, keepdim=True)
+ if self.weight is not None:
+ input = input * self.weight
+
+ input = input * torch.rsqrt(mean_square + self.eps)
+ return input
+
+ def extra_repr(self):
+ return f"{self.num_features}, eps={self.eps}, affine={self.affine}"
+
+
+class MegaScaleNorm(nn.Module):
+ """
+ Scale normalization introduced in MEGA which is similar to RMSNorm, but uses a single parameter for scalar
+ multiplication instead of a vector, and applies over a specified dimension
+ """
+
+ def __init__(self, dim, eps=1e-6, affine=True):
+ super().__init__()
+ self.dim = dim
+ self.eps = eps
+ self.affine = affine
+ if affine:
+ self.scalar = nn.Parameter(torch.Tensor(1))
+ else:
+ self.register_parameter("scalar", None)
+
+ def forward(self, input):
+ mean_square = torch.mean(torch.square(input), dim=self.dim, keepdim=True)
+ if self.scalar is not None:
+ input = self.scalar * input
+
+ output = input * torch.rsqrt(mean_square + self.eps)
+ return output
+
+
+class MegaSequenceNorm(nn.Module):
+ """
+ A wrapper class for various layer normalization options used in Mega. Used to handle differences in expectations on
+ input axis locations for different normalization methods.
+ """
+
+ def __init__(self, norm_type, embedding_dim, eps=1e-5, affine=True, export=False):
+ super().__init__()
+ if norm_type == "layernorm":
+ self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine=affine)
+ elif norm_type == "scalenorm":
+ self.norm = MegaScaleNorm(dim=-1, eps=eps, affine=affine)
+ elif norm_type == "rmsnorm":
+ self.norm = MegaRMSNorm(embedding_dim, eps=eps, affine=affine)
+ elif norm_type == "batchnorm":
+ self.norm = nn.BatchNorm1d(embedding_dim, eps=eps, affine=affine)
+ elif norm_type == "syncbatchnorm":
+ self.norm = nn.SyncBatchNorm(embedding_dim, eps=eps, affine=affine)
+ else:
+ raise ValueError(f"Unknown norm type: {norm_type}")
+
+ def forward(self, input):
+ if isinstance(self.norm, nn.modules.batchnorm._BatchNorm):
+ if input.dim() != 3:
+ raise ValueError("BatchNorm inputs must be exactly 3-dimensional")
+ input = input.permute(1, 2, 0)
+ input = self.norm(input)
+ return input.permute(2, 0, 1)
+ else:
+ return self.norm(input)
+
+
+class MegaMultiDimensionDampedEma(nn.Module):
+ """
+ Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
+ variable names and moving away from the stateful representation of incremental decoding state. See
+ "https://huggingface.co/papers/2209.10655" for more details.
+ """
+
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+
+ self.config = config
+
+ self.embed_dim = config.hidden_size
+ self.ndim = config.ema_projection_size
+ self.bidirectional = config.bidirectional
+ self.truncation = config.truncation
+ self.scale = math.sqrt(1.0 / self.ndim)
+
+ kernel_dim = 2 * config.hidden_size if self.bidirectional else config.hidden_size
+ # renamed delta (damping_factor) and alpha (decay_factor) to be more descriptive of what the parameters are doing
+ self.damping_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
+ self.decay_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
+ # renamed gamma (kernel_projection_matrix) and beta (ema_expansion_matrix) respectively to avoid HF renaming
+ # things and align with the paper's description of these params' behavior
+ self.ema_expansion_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
+ self.kernel_projection_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim))
+ # renamed omega to residual_weight to describe what it's doing
+ self.residual_weight = nn.Parameter(torch.Tensor(config.hidden_size))
+ self._kernel = None
+ self._coeffs = None
+
+ def _compute_ema_coefficients(self):
+ self._coeffs = None
+ # convert the alpha and delta parameters (kernel_dim x EMA projection size x 1) to [0, 1] with sigmoid
+ damping_factor = torch.sigmoid(self.damping_factor)
+ decay_factor = torch.sigmoid(self.decay_factor)
+ previous_timestep_weight = 1.0 - damping_factor * decay_factor
+ return damping_factor, previous_timestep_weight
+
+ def _compute_efficient_ema_kernel(self, length: int):
+ # computes the kernel used for efficient damped EMA applied via FFT convolution
+ self._kernel = None
+ # p and q have shape (kernel_dim x ema_projection_size x 1)
+ damping_factor, previous_timestep_weight = self._compute_ema_coefficients()
+ # extend the kernel to (kernel_dim X ema_projection_size X sequence_length) and
+ # multiply q by sequential ints up to the sequence length
+ vander = torch.arange(length).to(damping_factor).view(1, 1, length) * torch.log(previous_timestep_weight)
+ kernel = (damping_factor * self.ema_expansion_matrix) * torch.exp(vander)
+ # (kernel_dim X ema_projection_size X sequence_length) -> (kernel_dim, sequence_length)
+ return torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
+
+ def get_ema_coefficients(self):
+ if self.training:
+ return self._compute_ema_coefficients()
+ else:
+ if self._coeffs is None:
+ self._coeffs = self._compute_ema_coefficients()
+ return self._coeffs
+
+ def get_ema_kernel(self, length: int):
+ kernel_size = length if self.truncation is None else min(self.truncation, length)
+ if self.training:
+ return self._compute_efficient_ema_kernel(kernel_size)
+ else:
+ if self._kernel is None or self._kernel.size(-1) < kernel_size:
+ self._kernel = self._compute_efficient_ema_kernel(kernel_size)
+ return self._kernel[..., :kernel_size]
+
+ def fft_convolution(self, inputs, kernel, length):
+ # this is a wrapper for repeated use of EMA calculation via FFT (fast Fourier transform) convolution
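+ # padding both operands to 2 * length makes the FFT's circular convolution equivalent to the
+ # desired linear convolution over the original `length` samples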
+ inputs_fft = torch.fft.rfft(inputs.float(), n=2 * length)
+ kernel_fft = torch.fft.rfft(kernel.float(), n=2 * length)
+ convolved_sequence = torch.fft.irfft(inputs_fft * kernel_fft, n=2 * length)
+ return convolved_sequence
+
+ def ema_step(self, inputs, length, past_state=None):
+ if length == 1:
+ return self.one_ema_step(inputs, past_state=past_state)
+
+ # (kernel_dim X ema_projection_size X 1)
+ damping_factor, previous_timestep_weight = self.get_ema_coefficients()
+ # (kernel_dim X ema_projection_size X 1+sequence_length)
+ vander = torch.arange(length + 1).to(damping_factor).view(1, 1, length + 1) * torch.log(
+ previous_timestep_weight
+ )
+ vander = torch.exp(vander)
+ if past_state is not None:
+ # (kernel_dim X ema_projection_size X sequence_length) * (kernel_dim X ema_projection_size X 1)
+ # -> (kernel_dim X ema_projection_size X sequence_length)
+ past_ema_proj = vander[:, :, 1:] * (self.kernel_projection_matrix * self.scale).unsqueeze(-1)
+ # past_state will be (batch_size, kernel_dim, ema_projection_size)
+ past_ema_state = torch.einsum("bdn,dnl->bdl", past_state, past_ema_proj)
+ # (kernel_dim X ema_projection_size) * (batch_size X kernel_dim X ema_projection_size)
+ # -> (batch_size X kernel_dim X ema_projection_size)
+ past_vandermonde = vander[:, :, -1] * past_state
+ else:
+ past_ema_state = None
+ past_vandermonde = None
+
+ # (kernel_dim X ema_projection_size X sequence_length)
+ vander = vander[:, :, :-1]
+ kernel = (damping_factor * self.ema_expansion_matrix) * vander
+ kernel_proj = torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
+
+ ema_output = self.fft_convolution(inputs, kernel_proj, length=length)[..., 0:length]
+ ema_output = ema_output.type_as(inputs)
+ if past_ema_state is not None:
+ ema_output = ema_output + past_ema_state
+
+ updated_hidden_state = torch.einsum("bdl,dnl->bdn", inputs, torch.flip(kernel, dims=[2]))
+ if past_vandermonde is not None:
+ updated_hidden_state = updated_hidden_state + past_vandermonde
+ # return a tuple:
+ # (sequence_length, batch_size, kernel_dim)
+ # (batch_size, kernel_dim, ema_projection_size)
+ return ema_output.permute(2, 0, 1), updated_hidden_state
+
+ def one_ema_step(self, inputs, past_state=None):
+ damping_factor, previous_timestep_weight = self.get_ema_coefficients()
+ # (kernel_dim X ema_projection_size) x (batch_size X kernel_dim X 1)
+ # -> (batch_size X kernel_dim X ema_projection_size)
+ updated_state = (damping_factor * self.ema_expansion_matrix).squeeze(-1) * inputs
+ if past_state is not None:
+ updated_state = updated_state + previous_timestep_weight.squeeze(-1) * past_state
+ # (batch_size X kernel_dim)
+ out = torch.einsum("bdn,dn->bd", updated_state, self.kernel_projection_matrix * self.scale)
+ # (1 X batch_size X kernel_dim), (batch_size X kernel_dim X ema_projection_size)
+ return out.unsqueeze(0), updated_state
+
+ def forward(
+ self,
+ inputs,
+ attention_mask: Optional[torch.Tensor] = None,
+ prev_state: Optional[torch.Tensor] = None,
+ use_cache: bool = False,
+ ) -> torch.Tensor:
+ """
+ Mega's exponential moving average (EMA) sub-layer applied prior to single-headed (traditional) self-attention
+
+ Args:
+ inputs (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
+ Hidden state / embedding input to update via EMA based on FFT convolution
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored (mostly due to padding), where elements are either 1 for *not
+ masked* or 0 for *masked*
+ prev_state (`torch.Tensor` of shape `(batch_size, config.hidden_size, config.ema_projection_size)`, *optional*):
+ The hidden state returned from the previous timestep during incremental decoding.
+ use_cache (`bool`, default `False`):
+ Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
+ updated EMA hidden state for use in the next step
+
+ Returns:
+ `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+ inputs:
+ - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
+ states updated by EMA, with same shapes as inputs
+ - **updated_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
+ `(batch_size, config.hidden_size, config.ema_projection_size)` -- The incremental EMA state for use in the
+ next step of incremental decoding
+ """
+
+ seq_len, bsz, embed_dim = inputs.size()
+ if embed_dim != self.embed_dim:
+ raise ValueError(
+ f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}"
+ )
+
+ # sequence_length X batch_size X hidden_size
+ residual = inputs * self.residual_weight
+
+ # (sequence_length x batch_size x hidden_size) -> (batch_size x hidden_size x sequence_length)
+ inputs = inputs.permute(1, 2, 0)
+ # mask the input: output is a tensor with 0 in the masked positions
+ if attention_mask is not None:
+ inputs = inputs * (attention_mask.unsqueeze(1).type_as(inputs))
+
+ if self.bidirectional and use_cache:
+ raise RuntimeError("Bidirectional EMA does not support incremental state")
+
+ if use_cache:
+ out, updated_state = self.ema_step(inputs, seq_len, past_state=prev_state)
+
+ # (batch_size X hidden_size) -> (1 x batch_size x hidden_size)
+ out = F.silu(out + residual)
+
+ # if incremental decoding, return the new state along with the output
+ return out, updated_state
+ else:
+ # (hidden_size x sequence_length)
+ kernel = self.get_ema_kernel(seq_len)
+ fft_len = seq_len
+ s_index = 0
+ kernel_size = kernel.size(1)
+ if self.bidirectional:
+ # split the kernel for each direction of EMA
+ k1, k2 = torch.split(kernel, [self.embed_dim, self.embed_dim], dim=0)
+ # (hidden_size X 2*sequence_length - 1)
+ kernel = F.pad(k1, (kernel_size - 1, 0)) + F.pad(k2.flip(-1), (0, kernel_size - 1))
+ inputs = F.pad(inputs, (kernel_size - 1, 0))
+ fft_len = fft_len + kernel_size - 1
+ s_index = 2 * kernel_size - 2
+
+ ema_output = self.fft_convolution(inputs, kernel, length=fft_len)[..., s_index : s_index + seq_len]
+ ema_output = ema_output.type_as(inputs)
+ # (batch_size X hidden_size X sequence_length) -> (sequence_length X batch_size X hidden_size)
+ gated_ema_output = F.silu(ema_output.permute(2, 0, 1) + residual)
+
+ return gated_ema_output, None
+
+
+class MegaGatedCrossAttention(nn.Module):
+ """
+ Gated Structured State Attention for use in encoder-decoder model. See Mega paper for more details. Only
+ modifications from original implementation are variable names, removing the unnecessary `before_attn_fn` and
+ `static_kv` arguments, and the stateful representation of incremental decoder state.
+ """
+
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+
+ self.config = config
+ self.activation = ACT2FN[self.config.activation]
+ self.attention_activation = self.config.attention_activation
+ self.scaling = self.config.shared_representation_size**-0.5 if self.attention_activation == "softmax" else None
+
+ self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
+ self.hidden_dropout = MegaDropout(
+ self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
+ )
+ # Attention dropout is standard dropout
+ self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
+
+ self.prenorm = self.config.normalize_before_mega
+ self.norm = MegaSequenceNorm(
+ self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
+ )
+
+ self.k_proj = nn.Linear(self.config.hidden_size, self.config.shared_representation_size)
+ self.v_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
+ self.q_proj = nn.Linear(
+ self.config.hidden_size, 2 * self.config.hidden_size + self.config.shared_representation_size
+ )
+ self.h_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
+
+ if self.config.relative_positional_bias == "simple":
+ self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
+ elif self.config.relative_positional_bias == "rotary":
+ self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
+ else:
+ raise ValueError(f"unknown relative position bias: {self.config.relative_positional_bias}")
+
+ self.softmax = nn.Softmax(dim=-1)
+
+ def element_attention(self, query, key, key_padding_mask, pidx):
+ bsz, src_len, _ = key.size()
+ tgt_len = query.size(1) if pidx is None else pidx + 1
+ if key_padding_mask is not None:
+ # (batch_size X source_sequence_length) --> (batch_size X 1 X 1)
+ lengths = key_padding_mask.sum(dim=-1).view(bsz, 1, 1)
+ else:
+ lengths = src_len
+
+ # (target_sequence_length X source_sequence_length)
+ bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
+ if pidx is not None:
+ if query.size(1) != 1:
+ raise ValueError("Position offset provided with queries longer than 1 token")
+ # source_sequence_length
+ bias = bias[pidx]
+ else:
+ # (target_sequence_length X source_sequence_length)
+ bias = bias[:tgt_len]
+
+ # (batch_size X target_sequence_length X source_sequence_length)
+ qk = torch.bmm(query, key.transpose(1, 2)) / lengths + bias
+
+ attn_weights = ACT2FN[self.attention_activation](qk).type_as(qk)
+
+ if key_padding_mask is not None:
+ attn_weights = attn_weights * key_padding_mask.unsqueeze(1)
+
+ return attn_weights
+
+ def softmax_attention(self, query, key, key_padding_mask, pidx):
+ bsz, src_len, _ = key.size()
+ tgt_len = query.size(1) if pidx is None else pidx + 1
+
+ # (target_sequence_length X source_sequence_length)
+ bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
+ if pidx is not None:
+ if query.size(1) != 1:
+ raise ValueError("Position offset provided with queries longer than 1 token")
+ # source_sequence_length
+ bias = bias[pidx]
+ else:
+ # (target_sequence_length X source_sequence_length)
+ bias = bias[:tgt_len]
+
+ # scaled attention
+ query = query * self.scaling
+ # (batch_size X target_sequence_length X source_sequence_length)
+ qk = torch.bmm(query, key.transpose(1, 2)) + bias
+
+ if key_padding_mask is not None:
+ qk = qk.masked_fill((1 - key_padding_mask).unsqueeze(1).to(torch.bool), float("-inf"))
+
+ attn_weights = self.softmax(qk).type_as(qk)
+ return attn_weights
+
+ def forward(
+ self,
+ query,
+ key: Optional[torch.Tensor],
+ value: Optional[torch.Tensor],
+ key_padding_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+ """
+ Gated cross-attention used in Mega
+
+ Args:
+ query (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
+ The self (or target) sequence input used as query inputs for cross-attention
+ key (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
+ The cross (or source) sequence input with shape used as keys in cross-attention
+ value (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
+ The cross (or source) sequence input with shape used as values in cross-attention
+ key_padding_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
+ Padding mask corresponding to the source sequence, where entries are 1 for *not masked* and 0 for
+ *masked* tokens
+ past_key_values (`tuple(torch.FloatTensor)`, *optional*):
+ If provided, the hidden state returned from the previous timestep during incremental decoding; expects
+ that prior cross-attention keys and values will be the last two items in the tuple
+ output_attentions (`bool`, defaults to `False`):
+ Whether or not to return the cross-attention weights.
+ use_cache (`bool`, defaults to `False`):
+ Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
+ updated EMA hidden state for use in the next step
+
+ Returns:
+ `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+ inputs:
+ - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
+ Hidden states from target sequence updated by gated cross-attention
+ - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
+ `(batch_size, source_sequence_length, target_sequence_length)` -- The pairwise cross-attention weights
+ corresponding to each token in the source and target sequences
+ - **cross_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+ source_sequence_length, config.shared_representation_size)` -- The cross-attention key state for use in
+ the next step of incremental decoding
+ - **cross_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+ source_sequence_length, config.hidden_size)` -- The cross-attention value state for use in the next step
+ of incremental decoding
+ """
+
+ seq_len, bsz, embed_dim = query.size()
+ if embed_dim != self.config.hidden_size:
+ raise ValueError(
+ f"Unexpected embedding dimension received: input is {embed_dim} but expected {self.config.hidden_size}"
+ )
+
+ if past_key_values is not None:
+ # make sure the inputs only have a sequence length of 1 if we're doing incremental decoding
+ if seq_len != 1:
+ raise ValueError(f"Incremental decoding requested with self-sequence length > 1: {seq_len}")
+ # expect past_key_values to have (self_key, self_value, self_ema, cross_key, cross_value)
+ prev_cross_key, prev_cross_value = past_key_values[-2:]
+ key = value = None
+
+ # use the self-attention cache to get the position id of the current step
+ prev_self_key = past_key_values[0]
+ num_incremental_steps = prev_self_key.size(1) + 1
+ else:
+ prev_cross_key = prev_cross_value = None
+ # we still need the position id if we're doing incremental decoding (past_key_values will be None for the first step)
+ num_incremental_steps = 0 if use_cache and (seq_len == 1) else None
+
+ full_query = query
+ if self.prenorm:
+ full_query = self.norm(full_query)
+
+ # (target_sequence_length X batch_size X 2*hidden_size + shared_representation_size)
+ query_projected = self.q_proj(full_query)
+ # split the query projections into separate components
+ # - residual_weight is passed through sigmoid and sent through elementwise multiplication to the gated/weighted targets prior to being added to the query directly
+ # - target_gate is a silu-gated tensor that is multiplied by the attention-weighted target below prior to residual connection
+ # - attention_query is the part that is passed to the attention function
+ residual_weight, target_gate, attention_query = torch.split(
+ query_projected,
+ [self.config.hidden_size, self.config.hidden_size, self.config.shared_representation_size],
+ dim=-1,
+ )
+
+ # (target_sequence_length X batch_size X hidden_size)
+ residual_weight = torch.sigmoid(residual_weight)
+ target_gate = F.silu(target_gate)
+
+ if key is None:
+ if value is not None:
+ raise ValueError("Key and value must be `None` simultaneously")
+ projected_key = projected_value = None
+ else:
+ # (source_sequence_length X batch_size X shared_representation_size)
+ projected_key = self.k_proj(key)
+ # (source_sequence_length X batch_size X hidden_size)
+ projected_value = self.activation(self.v_proj(key))
+
+ # (target_sequence_length X batch_size X shared_representation_size)
+ # -> (batch_size X target_sequence_length X shared_representation_size)
+ attention_query = attention_query.transpose(0, 1)
+ if projected_key is not None:
+ projected_key = projected_key.transpose(0, 1)
+ if projected_value is not None:
+ projected_value = projected_value.transpose(0, 1)
+
+ # if we're doing incremental decoding, k and v are None and need to be overwritten with past values
+ if past_key_values is not None:
+ projected_key = prev_cross_key
+ projected_value = prev_cross_value
+
+ # if we're returning the cache for later use, store these now for later return (can be done without having past_key_values provided)
+ if use_cache:
+ updated_cross_key = projected_key
+ updated_cross_value = projected_value
+
+ ctx_len = projected_key.size(1)
+ # This is part of a workaround to get around fork/join parallelism
+ # not supporting Optional types.
+ if key_padding_mask is not None and key_padding_mask.dim() == 0:
+ key_padding_mask = None
+
+ if key_padding_mask is not None:
+ if key_padding_mask.size(0) != bsz:
+ raise ValueError("Key padding mask does not align on the batch dimension")
+ if key_padding_mask.size(1) != ctx_len:
+ raise ValueError("Key padding mask does not align on the sequence length dimension")
+
+ if self.attention_activation == "softmax":
+ attn_weights = self.softmax_attention(
+ attention_query, projected_key, key_padding_mask, num_incremental_steps
+ )
+ else:
+ attn_weights = self.element_attention(
+ attention_query, projected_key, key_padding_mask, num_incremental_steps
+ )
+
+ projected_value = self.hidden_dropout(projected_value, batch_first=True)
+ kernel = self.attention_dropout(attn_weights)
+ # (batch_size X target_sequence_length X hidden_size)
+ # -> (target_sequence_length X batch_size X hidden_size)
+ weighted_targets = torch.bmm(kernel, projected_value).transpose(0, 1)
+ # (target_sequence_length X batch_size X hidden_size)
+ weighted_targets = self.activation(self.h_proj(weighted_targets * target_gate))
+ weighted_targets = self.dropout(weighted_targets)
+ out = torch.addcmul(query, residual_weight, weighted_targets - query)
+
+ if not self.prenorm:
+ out = self.norm(out)
+
+ outputs = (out, attn_weights) if output_attentions else (out,)
+ if use_cache:
+ outputs = outputs + (updated_cross_key, updated_cross_value)
+
+ return outputs
+
+
+class MegaMovingAverageGatedAttention(nn.Module):
+ """
+ Pure PyTorch implementation of Mega block; see https://huggingface.co/papers/2209.10655 and original fairseq implementation
+ at https://github.com/facebookresearch/mega (copyright Meta Research, licensed under MIT License)
+
+ Differences from original implementation include hidden state refactor and fixed inconsistency with additive /
+ multiplicative attention masks
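+
+ Rough dataflow, as implemented in `forward` below (sketch): the input passes through the EMA
+ sub-layer (`ema_gate`); `mx_proj` of the EMA output is split into a sigmoid residual weight, a
+ shared single-head query/key representation plus a silu attention gate, and an intermediate
+ state; attention weights are applied to the `v_proj` values, gated, and mixed back into the
+ residual stream via `torch.addcmul`.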
+ """
+
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+ self.config = config
+ self.activation = ACT2FN[self.config.activation]
+ self.scaling = (
+ self.config.shared_representation_size**-0.5 if self.config.attention_activation == "softmax" else None
+ )
+ self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
+ self.hidden_dropout = MegaDropout(
+ self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
+ )
+ # attention dropout is standard dropout
+ self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
+
+ self.norm = MegaSequenceNorm(
+ self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
+ )
+ self.ema_gate = MegaMultiDimensionDampedEma(config)
+
+ self.v_proj = nn.Linear(self.config.hidden_size, self.config.intermediate_size)
+ self.mx_proj = nn.Linear(
+ self.config.hidden_size,
+ self.config.shared_representation_size + self.config.intermediate_size + 2 * self.config.hidden_size,
+ )
+ self.h_proj = nn.Linear(self.config.intermediate_size, self.config.hidden_size)
+
+ self.qk_weight = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
+ self.qk_bias = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
+
+ if self.config.relative_positional_bias == "simple":
+ self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
+ elif self.config.relative_positional_bias == "rotary":
+ self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
+ else:
+ raise ValueError(f"Unknown relative positional bias: {self.config.relative_positional_bias}")
+
+ self.softmax = nn.Softmax(dim=-1)
+ self.attention_function = (
+ self.softmax_attention if self.config.attention_activation == "softmax" else self.element_attention
+ )
+
+ def element_attention(self, query, key, padding_mask, causal_mask):
+ """
+ Apply element-wise attention via relu^2 or laplace. Same as original implementation but with standardized
+ causal attention mask. Expects the Hugging Face standard attention mask paradigm: 1 for not masked, and 0 for
+ masked.
+ """
+ seq_len = key.size(2)
+ if padding_mask is not None:
+ # (batch_size X number of chunks X 1)
+ lengths = padding_mask.sum(-1, keepdim=True)
+ # (batch_size X number of chunks X 1 X 1)
+ lengths = lengths.clamp(min=1.0).unsqueeze(-1)
+ else:
+ lengths = seq_len
+
+ if causal_mask is not None:
+ lengths = causal_mask.sum(dim=-1, keepdim=True)
+
+ # (sequence_length X sequence_length)
+ bias = self.rel_pos_bias(seq_len)
+ if seq_len != query.size(2):
+ if query.size(2) != 1:
+ raise ValueError("Size mismatch between Q and K in element attention")
+ # (1 X sequence_length)
+ bias = bias[-1:]
+
+ # (batch_size X number of chunks X sequence_length X sequence_length)
+ qk = torch.matmul(query, key.transpose(2, 3)) / lengths + bias
+
+ attn_weights = ACT2FN[self.config.attention_activation](qk).type_as(qk)
+
+ if padding_mask is not None:
+ attn_weights = attn_weights * padding_mask.unsqueeze(2)
+
+ if causal_mask is not None:
+ attn_weights = attn_weights * causal_mask
+
+ return attn_weights
+
+ def softmax_attention(self, query, key, padding_mask, causal_mask):
+ "Standard softmax self-attention, as in the original Transformer paper"
+ seq_len = key.size(2)
+ # (sequence_length X sequence_length)
+ bias = self.rel_pos_bias(seq_len)
+ if seq_len != query.size(2):
+ if query.size(2) != 1:
+ raise ValueError("Size mismatch between Q and K in softmax attention")
+ # (1 X sequence_length)
+ bias = bias[-1:]
+
+ # scaled attention
+ query = query * self.scaling
+
+ # (batch_size x number of chunks x chunk_size x chunk_size) if chunking
+ # (batch_size x 1 x sequence_length x sequence_length) otherwise
+ qk = torch.matmul(query, key.transpose(2, 3)) + bias
+
+ # apply causal mask (presumed to be 1/0 for not masked / masked)
+ # additive, but convert to 0/-inf (which is not explicitly in the Mega source code)
+ if causal_mask is not None:
+ additive_causal_mask = torch.zeros_like(causal_mask, dtype=qk.dtype)
+ additive_causal_mask = additive_causal_mask.masked_fill((1 - causal_mask).bool(), float("-inf"))
+ qk = qk + additive_causal_mask
+
+ if padding_mask is not None:
+ # 1 for tokens which are *not masked*
+ # 0 for tokens which are *masked*
+ # replace masked tokens with -inf to make softmax ignore them
+ # need to invert the padding mask to match what mega original did
+ padding_mask = 1 - padding_mask
+ padding_mask_all = padding_mask.all(dim=-1, keepdim=True)
+ padding_mask = torch.logical_and(padding_mask, ~padding_mask_all)
+ qk = qk.masked_fill(padding_mask.unsqueeze(2).to(torch.bool), float("-inf"))
+
+ attn_weights = self.softmax(qk).type_as(qk)
+ return attn_weights
+
+ def forward(
+ self,
+ input,
+ padding_mask: Optional[torch.Tensor] = None,
+ causal_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions=False,
+ use_cache=False,
+ ):
+ """
+ Mega's self-attention block, which combines multi-headed EMA with traditional self-attention
+
+ Args:
+ input (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
+ Hidden states to be updated by Mega's self-attention
+ padding_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked*
+ or 0 for *masked*
+ causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
+ masked* or 0 for *masked*
+ past_key_values (`tuple(torch.Tensor)`, *optional*):
+ The hidden states returned from the previous timestep during incremental decoding; expects that
+ self-attention key, value, and EMA states are the first 3 entries in the tuple
+ output_attentions (`bool`, default `False`):
+ Whether to return self-attention weights
+ use_cache (`bool`, default `False`):
+ Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
+ states for use in the next step
+
+ Returns:
+ `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+ inputs:
+ - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
+ states from target sequence updated by Mega's self-attention
+ - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
+ `(batch_size, 1, sequence_length, sequence_length)` -- The self-attention weights corresponding to how
+ each token in the input sequence attends to every other token
+ - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+ sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
+ step of incremental decoding
+ - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+ sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
+ incremental decoding
+ - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
+ `(batch_size, config.hidden_size, config.ema_projection_size)` -- The incremental EMA state for use in the
+ next step of incremental decoding.
+ """
+
+ seq_len, bsz, embed_dim = input.size()
+ if embed_dim != self.config.hidden_size:
+ raise ValueError(f"Input embedding dimension should be {self.config.hidden_size}; received {embed_dim}")
+
+ # store inputs for residual connection and handle pre-norm if requested
+ residual = input
+ if self.config.normalize_before_mega:
+ input = self.norm(input)
+
+ # (sequence_length X batch_size X hidden_size) -> (sequence_length X batch_size X intermediate_size)
+ value = self.activation(self.v_proj(input))
+
+ # unpack the incremental state if provided
+ # assumed to be (self K, self V, self EMA state, cross K, cross V)
+ # also assumes that incremental decoding is working one token at a time, so input sequence length must be 1
+ if self.config.is_decoder and (past_key_values is not None):
+ if seq_len > 1:
+ raise ValueError(f"Incremental decoding only supports self sequence length of 1; received {seq_len}")
+ # the first 3 items in the saved states will be these regardless of whether cross-attention is present
+ prev_self_key, prev_self_value, prev_ema_state = past_key_values[0:3]
+ else:
+ prev_self_key = prev_self_value = prev_ema_state = None
+
+ # ema output is (sequence_length x batch_size x hidden_size)
+ # updated_ema_state will be None if use_cache=False; otherwise (batch_size, config.hidden_size, config.ema_projection_size)
+ ema_out, updated_ema_state = self.ema_gate(
+ input, attention_mask=padding_mask, prev_state=prev_ema_state, use_cache=use_cache
+ )
+ ema_out = self.dropout(ema_out)
+
+ # (sequence_length X batch_size X hidden_size)
+ # -> (sequence_length X batch_size X 2*hidden_size + config.shared_representation_size + config.intermediate_size)
+ # - residual_weight -> sigmoid -> applied to residual connection in torch.addcmul
+ # - query_key_gates -> split into two components: query_key becomes query and key for attention input, gates becomes gating for self-attention output
+ # - intermediate_state -> added to weighted attention output, sent through activation, and has inputs subtracted during
+ # torch.addcmul to create the final layer output
+ base = self.mx_proj(ema_out)
+ residual_weight, query_key_gates, intermediate_state = torch.split(
+ base,
+ [
+ self.config.hidden_size,
+ self.config.shared_representation_size + self.config.intermediate_size,
+ self.config.hidden_size,
+ ],
+ dim=-1,
+ )
+
+ # (sequence_length X batch_size X hidden_size)
+ residual_weight = torch.sigmoid(residual_weight)
+
+ # (sequence_length X batch_size X shared_representation_size + intermediate_size)
+ query_key_gates = F.silu(query_key_gates)
+
+ # split into two different tensors: one for Q/K usage and the other for gating self-attention
+ query_key, attention_gate = torch.split(
+ query_key_gates, [self.config.shared_representation_size, self.config.intermediate_size], dim=-1
+ )
+
+ # (sequence_length X batch_size X shared_representation_size)
+ # -> (sequence_length X batch_size X 1 X shared_representation_size)
+ # -> (sequence_length X batch_size X 2 X shared_representation_size)
+ query_key = query_key.unsqueeze(2) * self.qk_weight + self.qk_bias
+
+ # (sequence_length X batch_size X 2 X shared_representation_size)
+ # -> 2 tensors of (sequence_length X batch_size X shared_representation_size)
+ query, key = torch.unbind(query_key, dim=2)
+
+ # (sequence_length X batch_size X dimension)
+ # -> (batch_size X sequence_length X dimension)
+ # where `dimension` is either shared_representation_size (queries and keys) or intermediate_size (values)
+ query = query.transpose(0, 1)
+ key = key.transpose(0, 1)
+ value = value.transpose(0, 1)
+
+ if self.config.is_decoder:
+ # combine history and current to save updated state (if history is provided)
+ # when chunking is applied, the past states will be None at the end of the chunk, in
+ # which case, proceed as if no K/V history had been provided
+ # saved states are stored with shape (batch_size X sequence_length X dimension)
+ if prev_self_key is not None:
+ key = torch.cat([prev_self_key, key], dim=1)
+ if prev_self_value is not None:
+ value = torch.cat([prev_self_value, value], dim=1)
+
+ # if not chunking, store as-is
+ if not self.config.use_chunking:
+ updated_self_key = key
+ updated_self_value = value
+ else:
+ curr_len = key.size(1) % self.config.chunk_size
+ if curr_len == 0:
+ # if we're chunking and have reached the end of a chunk, wipe out the saved state
+ updated_self_key = None
+ updated_self_value = None
+ else:
+ updated_self_key = key
+ updated_self_value = value
+
+ ctx_len = key.size(1) # potentially differs from seq_len because of incremental decoding
+ if not self.config.use_chunking:
+ # if we're not chunking, treat the entire sequence as one long chunk
+ # (batch_size X sequence_length X dimension) -> (batch_size X 1 X sequence_length X dimension)
+ query = query.unsqueeze(1)
+ key = key.unsqueeze(1)
+ value = value.unsqueeze(1)
+ if padding_mask is not None:
+ # (batch_size X sequence_length) -> (batch_size X 1 X sequence_length)
+ padding_mask = padding_mask.unsqueeze(1)
+ else:
+ # otherwise, split the sequences in the batch into `n_chunks` chunks of size `chunk_size`
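+ # e.g. with batch_size=2, sequence_length=8, chunk_size=4: (2, 8, dim) -> (2, 2, 4, dim)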
+ if seq_len < self.config.chunk_size:
+ query = query.unsqueeze(1)
+ else:
+ # (batch_size X sequence_length X dimension) -> (batch_size X n_chunks X chunk_size X dimension)
+ n_chunks = seq_len // self.config.chunk_size
+ query = query.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size)
+
+ if ctx_len < self.config.chunk_size:
+ key = key.unsqueeze(1)
+ value = value.unsqueeze(1)
+ if padding_mask is not None:
+ padding_mask = padding_mask.unsqueeze(1)
+ else:
+ # (batch_size X sequence_length X dimension) -> (batch_size X n_chunks X chunk_size X dimension)
+ n_chunks = ctx_len // self.config.chunk_size
+ key = key.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size)
+ value = value.reshape(bsz, n_chunks, self.config.chunk_size, self.config.intermediate_size)
+ if padding_mask is not None:
+ padding_mask = padding_mask.view(bsz, n_chunks, self.config.chunk_size)
+
+ # this is in the original Mega implementation to work around fork/join parallelism not supporting optional types
+ if padding_mask is not None and padding_mask.dim() == 0:
+ padding_mask = None
+
+ attn_weights = self.attention_function(query, key, padding_mask=padding_mask, causal_mask=causal_mask)
+
+ value = self.hidden_dropout(value, batch_first=True)
+ kernel = self.attention_dropout(attn_weights)
+
+ # (batch_size X n_chunks X chunk_size X intermediate_size) -> (sequence_length X batch_size X intermediate_size)
+ weighted_self_output = (
+ torch.matmul(kernel, value).view(bsz, seq_len, self.config.intermediate_size).transpose(0, 1)
+ )
+
+ # (sequence_length X batch_size X intermediate_size) -> (sequence_length X batch_size X hidden_size)
+ weighted_self_output = self.activation(intermediate_state + self.h_proj(weighted_self_output * attention_gate))
+ weighted_self_output = self.dropout(weighted_self_output)
+ # (sequence_length X batch_size X hidden_size)
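+ # torch.addcmul computes residual + residual_weight * (weighted_self_output - residual), i.e. a learned
+ # element-wise gate: (1 - residual_weight) * residual + residual_weight * weighted_self_output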
+ out = torch.addcmul(residual, residual_weight, weighted_self_output - residual)
+
+ if not self.config.normalize_before_mega:
+ out = self.norm(out)
+
+ return_values = (out, attn_weights) if output_attentions else (out,)
+
+ if self.config.is_decoder:
+ return_values = return_values + (updated_self_key, updated_self_value, updated_ema_state)
+
+ return return_values
+
+
+class MegaNormalizedFeedForwardNetwork(nn.Module):
+ """
+ Normalized feed-forward network used in Mega blocks. Left as-is from the original Mega repo aside from
+ retrieving args from the Hugging Face config.
+ """
+
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+
+ self.config = config
+ self.hidden_dim = config.nffn_hidden_size
+ self.act_fn = config.activation
+ self.activation = ACT2FN[config.activation]
+
+ self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
+ self.hidden_dropout = MegaDropout(
+ self.config.nffn_activation_dropout_prob, is_featurewise=self.config.use_feature_dropout
+ )
+
+ self.prenorm = self.config.normalize_before_ffn
+ self.norm = MegaSequenceNorm(
+ self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
+ )
+
+ self.fc1 = nn.Linear(self.config.hidden_size, self.config.nffn_hidden_size)
+ self.fc2 = nn.Linear(self.config.nffn_hidden_size, self.config.hidden_size)
+
+ def forward(self, inputs):
+ residual = inputs
+
+ if self.prenorm:
+ inputs = self.norm(inputs)
+
+ hidden = self.activation(self.fc1(inputs))
+ hidden = self.hidden_dropout(hidden)
+ output = self.fc2(hidden)
+ output = self.dropout(output)
+ output = output + residual
+
+ if not self.prenorm:
+ output = self.norm(output)
+
+ return output
+
+
+class MegaBlock(nn.Module):
+ def __init__(self, config: MegaConfig):
+ super().__init__()
+ self.seq_len_dim = 1
+ self.mega_layer = MegaMovingAverageGatedAttention(config)
+ self.nffn = MegaNormalizedFeedForwardNetwork(config) if config.use_normalized_ffn else None
+ self.is_decoder = config.is_decoder
+ self.add_cross_attention = config.add_cross_attention
+ if self.add_cross_attention:
+ if not self.is_decoder:
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+ self.cross_attn = MegaGatedCrossAttention(config)
+ else:
+ self.cross_attn = None
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.LongTensor] = None,
+ causal_mask: Optional[torch.LongTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: bool = False,
+ ) -> tuple[torch.Tensor]:
+ """
+ A single Mega layer: either encoder or decoder, with optional cross-attention and optional normalized
+ feed-forward layer
+
+ Args:
+ hidden_states (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
+ Hidden states to be updated by the Mega block
+ attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Indicates which entries in the self/target sequence are to be ignored (mostly due to padding), where
+ elements are either 1 for *not masked* or 0 for *masked*. Causal attention is enforced internally.
+ causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
+ Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
+ masked* or 0 for *masked*
+ encoder_hidden_states (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`, *optional*):
+ Encoder hidden states to be used for cross-attention (and required for encoder-decoder model setup)
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
+ Indicates which entries in the cross/source sequence are to be ignored (mostly due to padding), where
+ elements are either 1 for *not masked* or 0 for *masked*.
+ past_key_values (`tuple(torch.Tensor)`, *optional*):
+ The hidden states returned from the previous timestep during incremental decoding; expects that
+ self-attention key, value, and EMA states are the first 3 entries in the tuple, and (if doing
+ cross-attention) cross-attention key and value are the last 2 entries in the tuple
+ output_attentions (`bool`, default `False`):
+ Whether to return self-attention weights
+ use_cache (`bool`, default `False`):
+ Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
+ states for use in the next step
+
+ Returns:
+ `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+ inputs:
+ - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
+ Hidden states from target sequence updated by Mega
+ - **self_attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
+ `(batch_size, 1, target_sequence_length, target_sequence_length)` -- The self-attention weights
+ corresponding to how each token in the input sequence attends to every other token
+ - **cross_attn_weights** (*optional*, returned when `output_attentions=True` and
+ `config.add_cross_attention=True`) `torch.FloatTensor` of shape `(batch_size, source_sequence_length,
+ target_sequence_length)` -- Pairwise cross-attention weights between every entry in the source sequence
+ and target sequence
+ - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+ sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
+ step of incremental decoding
+ - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+ sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
+ incremental decoding
+ - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
+ `(batch_size, config.ndim)` -- The incremental EMA state for use in the next step of incremental decoding.
+ - **cross_key** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
+ `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.shared_representation_size)` --
+ The cross-attention key state for use in the next step of incremental decoding
+ - **cross_value** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
+ `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.hidden_size)` -- The
+ cross-attention value state for use in the next step of incremental decoding
+ """
+
+ # incremental decoding in the MegaMultiDimensionDampedEma module requires that the attention mask has the same
+ # sequence length as the input tensor; if we're caching incremental states, we assume the input
+ # sequence length is 1 (Mega will break otherwise), so we take the padding mask for the final
+ # token in the input (mask is received as [batch X sequence length])
+ if use_cache and (past_key_values is not None) and (attention_mask is not None):
+ mega_padding_mask = attention_mask[:, -1].unsqueeze(-1)
+ else:
+ mega_padding_mask = attention_mask
+
+ mega_outputs = self.mega_layer(
+ input=hidden_states,
+ padding_mask=mega_padding_mask,
+ causal_mask=causal_mask,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ new_hidden_states = mega_outputs[0]
+ self_key, self_value, self_ema_state = mega_outputs[-3:] if use_cache else (None, None, None)
+ self_attention_weights = mega_outputs[1] if output_attentions else None
+
+ # optional cross attention
+ if self.cross_attn is not None:
+ if encoder_hidden_states is None:
+ raise ValueError("Requested cross-attention without providing encoder hidden states")
+
+ cross_attn_outputs = self.cross_attn(
+ query=new_hidden_states,
+ key=encoder_hidden_states,
+ value=encoder_hidden_states,
+ key_padding_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ # update the hidden state from cross attention
+ new_hidden_states = cross_attn_outputs[0]
+ # store cross-attention k/v if caching
+ cross_key, cross_value = cross_attn_outputs[-2:] if use_cache else (None, None)
+ cross_attention_weights = cross_attn_outputs[1] if output_attentions else None
+
+ # optional NFFN follows cross attention
+ if self.nffn is not None:
+ new_hidden_states = self.nffn(new_hidden_states)
+
+ outs = (new_hidden_states,)
+ if output_attentions:
+ outs = outs + (self_attention_weights,)
+ if self.cross_attn is not None:
+ outs = outs + (cross_attention_weights,)
+
+ if use_cache:
+ new_key_values = (
+ self_key,
+ self_value,
+ self_ema_state,
+ )
+ if self.cross_attn is not None:
+ new_key_values = new_key_values + (cross_key, cross_value)
+
+ outs = outs + (new_key_values,)
+
+ return outs
+
+
+# copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->Mega
+class MegaPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class MegaPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: MegaConfig
+ base_model_prefix = "mega"
+ supports_gradient_checkpointing = False
+ _no_split_modules = ["MegaMovingAverageGatedAttention"]
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, MegaMultiDimensionDampedEma):
+ with torch.no_grad():
+ # delta & alpha
+ nn.init.normal_(module.damping_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
+ nn.init.normal_(module.decay_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
+ # beta [1, -1, 1, -1, ...] seems more stable.
+ val = torch.ones(self.config.ema_projection_size, 1)
+ if self.config.ema_projection_size > 1:
+ idx = torch.tensor(list(range(1, self.config.ema_projection_size, 2)))
+ val.index_fill_(0, idx, -1.0)
+ module.ema_expansion_matrix.normal_(mean=0.0, std=self.config.ema_beta_range).add_(val)
+ # gamma & omega
+ nn.init.normal_(module.kernel_projection_matrix, mean=0.0, std=self.config.ema_gamma_omega_range)
+ nn.init.normal_(module.residual_weight, mean=0.0, std=self.config.ema_gamma_omega_range)
+ elif isinstance(module, MegaSimpleRelativePositionalBias):
+ nn.init.normal_(module.rel_pos_bias, mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, MegaRotaryRelativePositionalBias):
+ nn.init.normal_(module.alpha, mean=0.0, std=self.config.initializer_range)
+ nn.init.normal_(module.b_param, mean=0.0, std=self.config.initializer_range)
+ elif isinstance(module, MegaScaleNorm):
+ if self.config.norm_affine:
+ nn.init.constant_(module.scalar, 1.0)
+ elif isinstance(module, MegaRMSNorm):
+ if self.config.norm_affine:
+ nn.init.constant_(module.weight, 1.0)
+ elif isinstance(module, MegaMovingAverageGatedAttention):
+ # linear layers covered separately by the generic nn.Linear init below
+ nn.init.normal_(module.qk_weight, mean=0.0, std=self.config.initializer_range)
+ nn.init.constant_(module.qk_bias, 0.0)
+ elif isinstance(module, nn.Linear):
+ # initializes all linear layers in the entire network
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+MEGA_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
+ heads, etc.).
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`MegaConfig`]): Model configuration class with all the parameters of the
+ model. Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+MEGA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+ This parameter can only be used when the model is initialized with the `add_token_type_embeddings` parameter
+ set to `True`. All the values in this tensor should always be < `config.type_vocab_size`.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare MEGA Model transformer outputting raw hidden-states without any specific head on top.",
+ MEGA_START_DOCSTRING,
+)
+class MegaModel(MegaPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added after self-attention, following the architecture described in *Mega: Moving Average
+ Equipped Gated Attention*_ by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig,
+ Jonathan May, and Luke Zettlemoyer.
+
+ To behave as a decoder, the model needs to be initialized with the `is_decoder` argument of the configuration
+ set to `True` and `bidirectional` set to `False`. To be used in a Seq2Seq model, the model needs to be
+ initialized with both `is_decoder=True` and `bidirectional=False`, as well as `add_cross_attention` set to
+ `True`; `encoder_hidden_states` is then expected as an input to the forward pass.
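+
+ Example (a minimal sketch; whether `MegaConfig`/`MegaModel` are importable from the top-level `transformers`
+ namespace depends on the installed version, since Mega is deprecated):
+
+ ```python
+ >>> from transformers import MegaConfig, MegaModel
+
+ >>> # encoder-style (bidirectional) model with default settings
+ >>> encoder = MegaModel(MegaConfig())
+
+ >>> # decoder-style model with causal self-attention and cross-attention
+ >>> decoder_config = MegaConfig(is_decoder=True, bidirectional=False, add_cross_attention=True)
+ >>> decoder = MegaModel(decoder_config)
+ ```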
+
+ .. _*Mega: Moving Average Equipped Gated Attention*: https://huggingface.co/papers/2209.10655
+
+ """
+
+ def __init__(self, config: MegaConfig, add_pooling_layer=True):
+ super().__init__(config)
+ self.config = config
+
+ self.embedding_layer = MegaEmbeddings(config)
+ self.layers = nn.ModuleList([MegaBlock(config) for _ in range(config.num_hidden_layers)])
+
+ self.pooler = MegaPooler(config) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing (retained from RoBERTa code)
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embedding_layer.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embedding_layer.word_embeddings = value
+
+ @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.num_hidden_layers`, *optional*):
+ Cached states returned by a previous forward pass when `use_cache=True`. Each per-layer tuple contains the
+ self-attention key, value, and EMA states (plus cross-attention key and value states when
+ `config.add_cross_attention=True`). Can be used to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ """
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ device = input_ids.device
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ device = inputs_embeds.device
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ batch_size, sequence_length = input_shape
+
+ # validate chunking against the true input length before the chunk-size override below; otherwise the
+ # check would always compare the chunk size against itself and never fire
+ if self.config.use_chunking and (sequence_length > self.config.chunk_size):
+ if sequence_length % self.config.chunk_size != 0:
+ raise ValueError(
+ "config.use_chunking is activated; the input sequence length must be shorter than or a "
+ f"multiple of config.chunk_size; received sequence length of {sequence_length} with chunk "
+ f"size {self.config.chunk_size}"
+ )
+
+ if self.config.use_chunking:
+ # when chunking, masks are built per chunk, so downstream shapes use the chunk size
+ input_shape = torch.tensor([input_shape[0], self.config.chunk_size])
+ batch_size, sequence_length = input_shape
+
+ if self.config.is_decoder:
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ # Mega expects the causal mask to be a 2D square matrix of (from) x (to) over the input sequence length
+ # the HF utility function generates a 3D causal mask which includes batch size, so we'll create a dummy
+ # mask with the correct device and all ones
+ temp_mask_for_extension = torch.ones((1, sequence_length), dtype=torch.long, device=device)
+ causal_mask = self.create_extended_attention_mask_for_decoder(input_shape, temp_mask_for_extension)
+
+ # get rid of batch dimension in the generated mask; result is (sequence_length X sequence_length)
+ causal_mask = causal_mask.squeeze(0)
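+ # with an all-ones dummy mask the result is simply lower-triangular, e.g. for sequence_length=3:
+ # [[1, 0, 0], [1, 1, 0], [1, 1, 1]], where 1 = may attend and 0 = masked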
+ else:
+ use_cache = False
+ causal_mask = None
+
+ # if using cache, make sure we have a tuple of tuples which matches the length of our hidden layers
+ if (past_key_values is not None) and (len(past_key_values) != self.config.num_hidden_layers):
+ raise ValueError(
+ f"Received past key/value cache with size mismatch; expected {self.config.num_hidden_layers}, received {len(past_key_values)}"
+ )
+
+ # get embeddings (batch X sequence length X embed dim)
+ embedding_output = self.embedding_layer(
+ input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+ )
+
+ # transpose for Mega --> (seq len X batch X embed dim)
+ hidden_states = embedding_output.transpose(0, 1)
+
+ # we expect encoder hidden states to also have batch first in line
+ # with typical Hugging Face behavior (which is also how we return them)
+ # Mega expects sequence length first, so do the same transpose here
+ if encoder_hidden_states is not None:
+ encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
+
+ # pass through mega layers
+ all_hidden_states = (embedding_output,) if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+ next_decoder_cache = () if use_cache else None
+ for i, mega_layer in enumerate(self.layers):
+ current_decoder_cache = past_key_values[i] if past_key_values is not None else None
+ mega_outputs = mega_layer(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ causal_mask=causal_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=current_decoder_cache,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = mega_outputs[0]
+ if output_hidden_states:
+ # store layer-wise hidden states in the way that the user expects
+ # (seq len X batch X embed dim) --> (batch X seq len X embed dim)
+ all_hidden_states += (hidden_states.transpose(0, 1),)
+ if output_attentions:
+ self_attn_weights = mega_outputs[1]
+ all_self_attentions += (self_attn_weights,)
+ if self.config.add_cross_attention:
+ cross_attn_weights = mega_outputs[2]
+ all_cross_attentions += (cross_attn_weights,)
+ if use_cache:
+ updated_cache = mega_outputs[-1]
+ next_decoder_cache += (updated_cache,)
+
+ # transpose final hidden states
+ hidden_states = hidden_states.transpose(0, 1)
+
+ # optional pooling layer
+ pooled_output = self.pooler(hidden_states) if self.pooler is not None else None
+
+ if not return_dict:
+ return (hidden_states, pooled_output) + (
+ all_hidden_states,
+ next_decoder_cache,
+ all_self_attentions,
+ all_cross_attentions,
+ )
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ pooler_output=pooled_output,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+@add_start_docstrings(
+ """MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING
+)
+class MegaForCausalLM(MegaPreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config: MegaConfig):
+ super().__init__(config)
+
+ if not config.is_decoder:
+ logger.warning("If you want to use `MegaForCausalLM` as a standalone, add `is_decoder=True.`")
+
+ self.mega = MegaModel(config, add_pooling_layer=False)
+
+ if config.add_lm_hidden_dense_layer:
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.hidden_activation = nn.Tanh()
+ else:
+ self.dense = None
+ self.hidden_activation = None
+
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the left-to-right language modeling loss (next-word prediction). Indices should be in
+ `[-100, 0, ..., config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.num_hidden_layers`, *optional*):
+ Cached states returned by a previous forward pass when `use_cache=True`. Each per-layer tuple contains the
+ self-attention key, value, and EMA states (plus cross-attention key and value states when
+ `config.add_cross_attention=True`). Can be used to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, MegaForCausalLM, AutoConfig
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
+ >>> config = AutoConfig.from_pretrained("mnaylor/mega-base-wikitext")
+ >>> config.is_decoder = True
+ >>> config.bidirectional = False
+ >>> model = MegaForCausalLM.from_pretrained(
+ ... "mnaylor/mega-base-wikitext", config=config, ignore_mismatched_sizes=True
+ ... )
+
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> prediction_logits = outputs.logits
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ if labels is not None:
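+ # cached key/value states are only useful for generation; skip caching when computing the LM loss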
+ use_cache = False
+
+ outputs = self.mega(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ if self.dense is not None:
+ sequence_output = self.dense(sequence_output)
+ sequence_output = self.hidden_activation(sequence_output)
+
+ prediction_scores = self.lm_head(sequence_output)
+
+ lm_loss = None
+ if labels is not None:
+ # we are doing next-token prediction; shift prediction scores and input ids by one
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+ labels = labels[:, 1:].contiguous()
+ loss_fct = CrossEntropyLoss()
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((lm_loss,) + output) if lm_loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=lm_loss,
+ logits=prediction_scores,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
+ input_shape = input_ids.shape
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_shape)
+
+ # cut decoder_input_ids if past is used
+ if past_key_values is not None:
+ input_ids = input_ids[:, -1:]
+
+ return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
+
+ def _reorder_cache(self, past_key_values, beam_idx):
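+ # reorder each layer's cached states along the batch dimension so the cache follows the beams
+ # selected at this step of beam search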
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING)
+class MegaForMaskedLM(MegaPreTrainedModel):
+ _tied_weights_keys = ["mlm_head.weight"]
+
+ def __init__(self, config: MegaConfig):
+ super().__init__(config)
+
+ if config.is_decoder:
+ logger.warning(
+ "If you want to use `MegaForMaskedLM`, set `config.is_decoder=False` for "
+ "bi-directional self-attention."
+ )
+
+ self.mega = MegaModel(config, add_pooling_layer=False)
+ if config.add_lm_hidden_dense_layer:
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.hidden_activation = nn.Tanh()
+ else:
+ self.dense = None
+ self.hidden_activation = None
+ self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
+ self.dropout = nn.Dropout(config.dropout_prob)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.mlm_head
+
+ def set_output_embeddings(self, new_embeddings):
+ self.mlm_head = new_embeddings
+
+ @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ mask="",
+ expected_output="' Paris'",
+ expected_loss=0.1,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.mega(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = outputs[0]
+ if self.dense is not None:
+ sequence_output = self.dense(sequence_output)
+ sequence_output = self.hidden_activation(sequence_output)
+ prediction_scores = self.mlm_head(sequence_output)
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ MEGA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+ output) e.g. for GLUE tasks.
+ """,
+ MEGA_START_DOCSTRING,
+)
+class MegaForSequenceClassification(MegaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.config = config
+
+ self.mega = MegaModel(config, add_pooling_layer=False)
+ self.classifier = MegaClassificationHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (mean-square loss); if
+ `config.num_labels > 1` a classification loss is computed (cross-entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.mega(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = outputs[0]
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
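+ # infer the problem type from num_labels and the label dtype the first time labels are provided,
+ # then cache the decision on the config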
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ MEGA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+ softmax) e.g. for RocStories/SWAG tasks.
+ """,
+ MEGA_START_DOCSTRING,
+)
+class MegaForMultipleChoice(MegaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.mega = MegaModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, 1)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
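+ # flatten the choice dimension into the batch dimension:
+ # (batch_size, num_choices, sequence_length) -> (batch_size * num_choices, sequence_length)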
+ flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ flat_inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ outputs = self.mega(
+ flat_input_ids,
+ token_type_ids=flat_token_type_ids,
+ attention_mask=flat_attention_mask,
+ inputs_embeds=flat_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ MEGA_START_DOCSTRING,
+)
+class MegaForTokenClassification(MegaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.mega = MegaModel(config, add_pooling_layer=False)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.mega(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+# copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Mega
+class MegaClassificationHead(nn.Module):
+ """Head for sentence-level classification tasks."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+ def forward(self, features, **kwargs):
+ x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+ x = self.dropout(x)
+ x = self.dense(x)
+ x = torch.tanh(x)
+ x = self.dropout(x)
+ x = self.out_proj(x)
+ return x
+
+
+@add_start_docstrings(
+ """
+ MEGA Model with a span classification head on top for extractive question-answering tasks like SQuAD (linear
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ MEGA_START_DOCSTRING,
+)
+class MegaForQuestionAnswering(MegaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.mega = MegaModel(config, add_pooling_layer=False)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ start_positions: Optional[torch.LongTensor] = None,
+ end_positions: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for the position (index) of the start of the labelled span for computing the token classification
+ loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+ sequence are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for the position (index) of the end of the labelled span for computing the token classification
+ loss. Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+ sequence are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.mega(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
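+ # project each token to 2 logits, then split into start and end logits of shape (batch_size, sequence_length)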
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, splitting can add an extra dimension; squeeze it away
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs; we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "MegaForCausalLM",
+ "MegaForMaskedLM",
+ "MegaForMultipleChoice",
+ "MegaForQuestionAnswering",
+ "MegaForSequenceClassification",
+ "MegaForTokenClassification",
+ "MegaModel",
+ "MegaPreTrainedModel",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03b556e2eddf6d5f81f6f4a15346596dd5a32f85
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_mmbt import *
+ from .modeling_mmbt import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..444dbab401e20eb96e693128bd97001040768eb0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cfeaf7a527e423f3ef0f5dcbda0ca07210aab3ec
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/configuration_mmbt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0e8f9f81d9afa62331e09e1e5d7a48a528a87e0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/__pycache__/modeling_mmbt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c58e4e6cdd0d0a8c87b9b94ca896c87970b5654
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py
@@ -0,0 +1,45 @@
+# coding=utf-8
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MMBT configuration"""
+
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MMBTConfig:
+ """
+ This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT
+ model according to the specified arguments, defining the model architecture.
+
+ Args:
+        config ([`PretrainedConfig`]):
+ Config of the underlying Transformer models. Its values are copied over to use a single config.
+ num_labels (`int`, *optional*):
+ Size of final Linear layer for classification.
+ modal_hidden_size (`int`, *optional*, defaults to 2048):
+ Embedding dimension of the non-text modality encoder.
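+
+    Example (an illustrative sketch; `num_labels=2` is an arbitrary choice, and `BertConfig` stands in for any
+    text transformer config):
+
+    ```python
+    >>> from transformers import BertConfig
+    >>> from transformers.models.deprecated.mmbt import MMBTConfig
+
+    >>> # Adopt a BERT config's attributes and add the multimodal-specific ones.
+    >>> text_config = BertConfig()
+    >>> config = MMBTConfig(text_config, num_labels=2, modal_hidden_size=2048)
+    ```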
+ """
+
+ def __init__(self, config, num_labels=None, modal_hidden_size=2048):
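+        # Adopt all attributes of the underlying text transformer's config. Note that this shares the same
+        # `__dict__` object with `config` rather than copying it.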
+ self.__dict__ = config.__dict__
+ self.modal_hidden_size = modal_hidden_size
+        if num_labels is not None:
+ self.num_labels = num_labels
+
+
+__all__ = ["MMBTConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py
new file mode 100644
index 0000000000000000000000000000000000000000..45ae577f7fced2d276f4f54c5bf859e27e08ebae
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py
@@ -0,0 +1,410 @@
+# coding=utf-8
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch MMBT model."""
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
+from ....modeling_utils import ModuleUtilsMixin
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MMBTConfig"
+
+
+class ModalEmbeddings(nn.Module):
+ """Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
+
+ def __init__(self, config, encoder, embeddings):
+ super().__init__()
+ self.config = config
+ self.encoder = encoder
+ self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
+ self.position_embeddings = embeddings.position_embeddings
+ self.token_type_embeddings = embeddings.token_type_embeddings
+ self.word_embeddings = embeddings.word_embeddings
+ self.LayerNorm = embeddings.LayerNorm
+ self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
+
+ def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
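+        # Project the modal encoder's output into the transformer's hidden size so the modal "tokens" can be
+        # concatenated with the text token embeddings downstream.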
+ token_embeddings = self.proj_embeddings(self.encoder(input_modal))
+ seq_length = token_embeddings.size(1)
+
+ if start_token is not None:
+ start_token_embeds = self.word_embeddings(start_token)
+ seq_length += 1
+ token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)
+
+ if end_token is not None:
+ end_token_embeds = self.word_embeddings(end_token)
+ seq_length += 1
+ token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)
+
+ if position_ids is None:
+ position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
+ position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
+
+ if token_type_ids is None:
+ token_type_ids = torch.zeros(
+ (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
+ )
+
+ position_embeddings = self.position_embeddings(position_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+ embeddings = token_embeddings + position_embeddings + token_type_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+MMBT_START_DOCSTRING = r"""
+    The MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and
+    Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
+    It's a supervised multimodal bitransformer that fuses information from text and image encoders and obtains
+    state-of-the-art performance on various multimodal classification benchmark tasks.
+
+    Unlike most models in the library, this model does not inherit from [`PreTrainedModel`]; it is a plain PyTorch
+    [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass that mixes in
+    [`~modeling_utils.ModuleUtilsMixin`] for generic helpers such as building extended attention masks. Use it as a
+    regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+    behavior.
+
+ Parameters:
+ config ([`MMBTConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration.
+ transformer (`nn.Module`): A text transformer that is used by MMBT.
+ It should have embeddings, encoder, and pooler attributes.
+ encoder (`nn.Module`): Encoder for the second modality.
+            It should take in a batch of modal inputs and return embeddings of shape `(batch_size, k, n)`.
+"""
+
+MMBT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_modal (`torch.FloatTensor` of shape `(batch_size, ***)`):
+            The other modality data, in whatever shape the encoder for that modality expects, e.g. with an image
+            encoder the shape would be `(batch_size, channels, height, width)`.
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. The [CLS] token should not be added here, as it is
+            appended to the end of the other modality embeddings. Indices can be obtained using [`AutoTokenizer`]. See
+ [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+        modal_start_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Optional start token to be added to the other modality embeddings; [CLS] is most commonly used for
+            classification tasks.
+        modal_end_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Optional end token to be added to the other modality embeddings; [SEP] is most commonly used.
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+        modal_token_type_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
+ Segment token indices to indicate different portions of the non-text modality. The embeddings from these
+ tokens will be summed with the respective token embeddings for the non-text modality.
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ modal_position_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings for the non-text modality.
+ Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, embedding_dim)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare MMBT Model outputting raw hidden-states without any specific head on top.",
+ MMBT_START_DOCSTRING,
+)
+class MMBTModel(nn.Module, ModuleUtilsMixin):
+ def __init__(self, config, transformer, encoder):
+ super().__init__()
+ self.config = config
+ self.transformer = transformer
+ self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
+
+ @add_start_docstrings_to_model_forward(MMBT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_modal,
+ input_ids=None,
+ modal_start_tokens=None,
+ modal_end_tokens=None,
+ attention_mask=None,
+ token_type_ids=None,
+ modal_token_type_ids=None,
+ position_ids=None,
+ modal_position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ # For example purposes. Not runnable.
+ transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
+ encoder = ImageEncoder(args)
+ mmbt = MMBTModel(config, transformer, encoder)
+ ```"""
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_txt_shape = input_ids.size()
+ elif inputs_embeds is not None:
+ input_txt_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ modal_embeddings = self.modal_encoder(
+ input_modal,
+ start_token=modal_start_tokens,
+ end_token=modal_end_tokens,
+ position_ids=modal_position_ids,
+ token_type_ids=modal_token_type_ids,
+ )
+
+ input_modal_shape = modal_embeddings.size()[:-1]
+
+ if token_type_ids is None:
+ token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)
+
+ txt_embeddings = self.transformer.embeddings(
+ input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+ )
+
+ embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)
+
+ input_shape = embedding_output.size()[:-1]
+
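+        # `attention_mask` covers only the text tokens, so prepend a block of ones for the modal positions
+        # (which come first in the fused sequence) before building the extended mask.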
+ if attention_mask is None:
+ attention_mask = torch.ones(input_shape, device=device)
+ else:
+ attention_mask = torch.cat(
+ [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
+ )
+ if encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(input_shape, device=device)
+ else:
+ encoder_attention_mask = torch.cat(
+ [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
+ )
+
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ encoder_outputs = self.transformer.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = encoder_outputs[0]
+ pooled_output = self.transformer.pooler(sequence_output)
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+    def get_input_embeddings(self):
+        return self.transformer.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.transformer.embeddings.word_embeddings = value
+
+
+@add_start_docstrings(
+ """
+ MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
+ """,
+ MMBT_START_DOCSTRING,
+ MMBT_INPUTS_DOCSTRING,
+)
+class MMBTForClassification(nn.Module):
+ r"""
+ **labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`:
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+    Returns:
+        *Tuple* comprising various elements depending on the configuration (config) and inputs:
+
+        **loss**: (*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`:
+            Classification (or regression if `config.num_labels == 1`) loss.
+        **logits**: `torch.FloatTensor` of shape `(batch_size, config.num_labels)`:
+            Classification (or regression if `config.num_labels == 1`) scores (before SoftMax).
+        **hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one
+            for the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length,
+            hidden_size)`: Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        **attentions**: (*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for
+            each layer) of shape `(batch_size, num_heads, sequence_length, sequence_length)`: Attention weights after
+            the attention softmax, used to compute the weighted average in the self-attention heads.
+
+ Examples:
+
+ ```python
+ # For example purposes. Not runnable.
+ transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
+ encoder = ImageEncoder(args)
+ model = MMBTForClassification(config, transformer, encoder)
+ outputs = model(input_modal, input_ids, labels=labels)
+ loss, logits = outputs[:2]
+ ```"""
+
+    def __init__(self, config, transformer, encoder):
+        super().__init__()
+        self.config = config
+        self.num_labels = config.num_labels
+
+ self.mmbt = MMBTModel(config, transformer, encoder)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ def forward(
+ self,
+ input_modal,
+ input_ids=None,
+ modal_start_tokens=None,
+ modal_end_tokens=None,
+ attention_mask=None,
+ token_type_ids=None,
+ modal_token_type_ids=None,
+ position_ids=None,
+ modal_position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ labels=None,
+ return_dict=None,
+ ):
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.mmbt(
+ input_modal=input_modal,
+ input_ids=input_ids,
+ modal_start_tokens=modal_start_tokens,
+ modal_end_tokens=modal_end_tokens,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ modal_token_type_ids=modal_token_type_ids,
+ position_ids=position_ids,
+ modal_position_ids=modal_position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ if self.num_labels == 1:
+ # We are doing regression
+ loss_fct = MSELoss()
+ loss = loss_fct(logits.view(-1), labels.view(-1))
+ else:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5373969ce7831491b0d5fa5495078fb1d3f6e4e
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_nat import *
+ from .modeling_nat import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9002d7bda03916286f1153a2e9e1e94a5c602447
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dedba7310de4e7487921312c93c8e9180983629e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/configuration_nat.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1417a933a7deecf416ba201698a6c8c9fa25a9ce
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/__pycache__/modeling_nat.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/configuration_nat.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/configuration_nat.py
new file mode 100644
index 0000000000000000000000000000000000000000..002eaa7f82b6927ab383523e9b9a6abc5c2433a9
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/configuration_nat.py
@@ -0,0 +1,148 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Neighborhood Attention Transformer model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ....utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class NatConfig(BackboneConfigMixin, PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the Nat
+ [shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ patch_size (`int`, *optional*, defaults to 4):
+ The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ embed_dim (`int`, *optional*, defaults to 64):
+ Dimensionality of patch embedding.
+ depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
+ Number of layers in each level of the encoder.
+ num_heads (`list[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
+ Number of attention heads in each layer of the Transformer encoder.
+ kernel_size (`int`, *optional*, defaults to 7):
+ Neighborhood Attention kernel size.
+ mlp_ratio (`float`, *optional*, defaults to 3.0):
+ Ratio of MLP hidden dimensionality to embedding dimensionality.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether or not a learnable bias should be added to the queries, keys and values.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings and encoder.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ drop_path_rate (`float`, *optional*, defaults to 0.1):
+ Stochastic depth rate.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+ `"selu"` and `"gelu_new"` are supported.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+ The epsilon used by the layer normalization layers.
+ layer_scale_init_value (`float`, *optional*, defaults to 0.0):
+ The initial value for the layer scale. Disabled if <=0.
+ out_features (`list[str]`, *optional*):
+ If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+ (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+ corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+ out_indices (`list[int]`, *optional*):
+ If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+ many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+ If unset and `out_features` is unset, will default to the last stage. Must be in the
+ same order as defined in the `stage_names` attribute.
+
+ Example:
+
+ ```python
+ >>> from transformers import NatConfig, NatModel
+
+ >>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration
+ >>> configuration = NatConfig()
+
+ >>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration
+ >>> model = NatModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "nat"
+
+ attribute_map = {
+ "num_attention_heads": "num_heads",
+ "num_hidden_layers": "num_layers",
+ }
+
+ def __init__(
+ self,
+ patch_size=4,
+ num_channels=3,
+ embed_dim=64,
+ depths=[3, 4, 6, 5],
+ num_heads=[2, 4, 8, 16],
+ kernel_size=7,
+ mlp_ratio=3.0,
+ qkv_bias=True,
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ drop_path_rate=0.1,
+ hidden_act="gelu",
+ initializer_range=0.02,
+ layer_norm_eps=1e-5,
+ layer_scale_init_value=0.0,
+ out_features=None,
+ out_indices=None,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.embed_dim = embed_dim
+ self.depths = depths
+ self.num_layers = len(depths)
+ self.num_heads = num_heads
+ self.kernel_size = kernel_size
+ self.mlp_ratio = mlp_ratio
+ self.qkv_bias = qkv_bias
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.drop_path_rate = drop_path_rate
+ self.hidden_act = hidden_act
+ self.layer_norm_eps = layer_norm_eps
+ self.initializer_range = initializer_range
+ # we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel
+ # this indicates the channel dimension after the last stage of the model
+ self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
+ self.layer_scale_init_value = layer_scale_init_value
+ self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+ self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+ out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+ )
+
+
+__all__ = ["NatConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/modeling_nat.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/modeling_nat.py
new file mode 100644
index 0000000000000000000000000000000000000000..a619cdb11225779eb6ab499840658754be65c9ff
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nat/modeling_nat.py
@@ -0,0 +1,932 @@
+# coding=utf-8
+# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Neighborhood Attention Transformer model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_outputs import BackboneOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+ ModelOutput,
+ OptionalDependencyNotAvailable,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_natten_available,
+ logging,
+ replace_return_docstrings,
+ requires_backends,
+)
+from ....utils.backbone_utils import BackboneMixin
+from .configuration_nat import NatConfig
+
+
+if is_natten_available():
+ from natten.functional import natten2dav, natten2dqkrpb
+else:
+
+ def natten2dqkrpb(*args, **kwargs):
+ raise OptionalDependencyNotAvailable()
+
+ def natten2dav(*args, **kwargs):
+ raise OptionalDependencyNotAvailable()
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "NatConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224"
+_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "shi-labs/nat-mini-in1k-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
+
+
+# drop_path and NatDropPath are from the timm library.
+
+
+@dataclass
+class NatEncoderOutput(ModelOutput):
+ """
+ Nat encoder's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+ reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class NatModelOutput(ModelOutput):
+ """
+ Nat model's outputs that also contains a pooling of the last hidden states.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+ Average pooling of the last layer hidden-state.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ pooler_output: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+ reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class NatImageClassifierOutput(ModelOutput):
+ """
+ Nat outputs for image classification.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+            Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+ shape `(batch_size, hidden_size, height, width)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+ include the spatial dimensions.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+ reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class NatEmbeddings(nn.Module):
+ """
+ Construct the patch and position embeddings.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+
+ self.patch_embeddings = NatPatchEmbeddings(config)
+
+ self.norm = nn.LayerNorm(config.embed_dim)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
+ embeddings = self.patch_embeddings(pixel_values)
+ embeddings = self.norm(embeddings)
+
+ embeddings = self.dropout(embeddings)
+
+ return embeddings
+
+
+class NatPatchEmbeddings(nn.Module):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ patch_size = config.patch_size
+ num_channels, hidden_size = config.num_channels, config.embed_dim
+ self.num_channels = num_channels
+
+        if patch_size != 4:
+            # TODO: Support arbitrary patch sizes.
+            raise ValueError("Nat only supports a patch size of 4 at the moment.")
+
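+        # Two stride-2 convolutions give an overall 4x spatial reduction, matching the (only supported) patch size.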
+ self.projection = nn.Sequential(
+ nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+ nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+ )
+
+ def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
+ _, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ embeddings = self.projection(pixel_values)
+ embeddings = embeddings.permute(0, 2, 3, 1)
+
+ return embeddings
+
+
+class NatDownsampler(nn.Module):
+ """
+ Convolutional Downsampling Layer.
+
+ Args:
+ dim (`int`):
+ Number of input channels.
+ norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+ Normalization layer class.
+ """
+
+ def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+ super().__init__()
+ self.dim = dim
+ self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+ self.norm = norm_layer(2 * dim)
+
+ def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
+ input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
+ input_feature = self.norm(input_feature)
+ return input_feature
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
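+
+
+# An illustrative note (editorial, not from the original file): during training, `drop_path` zeroes each sample's
+# residual branch with probability `drop_prob` and rescales the surviving samples by `1 / keep_prob`, keeping the
+# output's expected value equal to the input. For example:
+#
+#   x = torch.ones(4, 1)
+#   out = drop_path(x, drop_prob=0.5, training=True)
+#   # each row of `out` is now either 0.0 or 2.0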
+
+
+class NatDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return f"p={self.drop_prob}"
+
+
+class NeighborhoodAttention(nn.Module):
+ def __init__(self, config, dim, num_heads, kernel_size):
+ super().__init__()
+ if dim % num_heads != 0:
+ raise ValueError(
+ f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+ )
+
+ self.num_attention_heads = num_heads
+ self.attention_head_size = int(dim / num_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+ self.kernel_size = kernel_size
+
+        # rpb is learnable relative positional biases; the same concept is used in Swin.
+ self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
+
+ self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+ self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+ self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x):
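+        # (batch, height, width, all_head_size)
+        #   -> view:    (batch, height, width, num_heads, head_size)
+        #   -> permute: (batch, num_heads, height, width, head_size)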
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 3, 1, 2, 4)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ query_layer = self.transpose_for_scores(self.query(hidden_states))
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ # Apply the scale factor before computing attention weights. It's usually more efficient because
+ # attention weights are typically a bigger tensor compared to query.
+        # It gives identical results because scalar multiplication commutes with matrix multiplication.
+ query_layer = query_layer / math.sqrt(self.attention_head_size)
+
+ # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
+ attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, 1)
+ context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class NeighborhoodAttentionOutput(nn.Module):
+ def __init__(self, config, dim):
+ super().__init__()
+ self.dense = nn.Linear(dim, dim)
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+class NeighborhoodAttentionModule(nn.Module):
+ def __init__(self, config, dim, num_heads, kernel_size):
+ super().__init__()
+ self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size)
+ self.output = NeighborhoodAttentionOutput(config, dim)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ self_outputs = self.self(hidden_states, output_attentions)
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class NatIntermediate(nn.Module):
+ def __init__(self, config, dim):
+ super().__init__()
+ self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class NatOutput(nn.Module):
+ def __init__(self, config, dim):
+ super().__init__()
+ self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ return hidden_states
+
+
+class NatLayer(nn.Module):
+ def __init__(self, config, dim, num_heads, drop_path_rate=0.0):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.kernel_size = config.kernel_size
+ self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+ self.attention = NeighborhoodAttentionModule(config, dim, num_heads, kernel_size=self.kernel_size)
+ self.drop_path = NatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+ self.intermediate = NatIntermediate(config, dim)
+ self.output = NatOutput(config, dim)
+ self.layer_scale_parameters = (
+ nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
+ if config.layer_scale_init_value > 0
+ else None
+ )
+
+ def maybe_pad(self, hidden_states, height, width):
+ window_size = self.kernel_size
+ pad_values = (0, 0, 0, 0, 0, 0)
+ if height < window_size or width < window_size:
+ pad_l = pad_t = 0
+ pad_r = max(0, window_size - width)
+ pad_b = max(0, window_size - height)
+ pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
+ hidden_states = nn.functional.pad(hidden_states, pad_values)
+ return hidden_states, pad_values
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor, torch.Tensor]:
+ batch_size, height, width, channels = hidden_states.size()
+ shortcut = hidden_states
+
+ hidden_states = self.layernorm_before(hidden_states)
+ # pad hidden_states if they are smaller than kernel size
+ hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
+
+ _, height_pad, width_pad, _ = hidden_states.shape
+
+ attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
+
+ attention_output = attention_outputs[0]
+
+ was_padded = pad_values[3] > 0 or pad_values[5] > 0
+ if was_padded:
+ attention_output = attention_output[:, :height, :width, :].contiguous()
+
+ if self.layer_scale_parameters is not None:
+ attention_output = self.layer_scale_parameters[0] * attention_output
+
+ hidden_states = shortcut + self.drop_path(attention_output)
+
+ layer_output = self.layernorm_after(hidden_states)
+ layer_output = self.output(self.intermediate(layer_output))
+
+ if self.layer_scale_parameters is not None:
+ layer_output = self.layer_scale_parameters[1] * layer_output
+
+ layer_output = hidden_states + self.drop_path(layer_output)
+
+ layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+ return layer_outputs
+
+
+class NatStage(nn.Module):
+ def __init__(self, config, dim, depth, num_heads, drop_path_rate, downsample):
+ super().__init__()
+ self.config = config
+ self.dim = dim
+ self.layers = nn.ModuleList(
+ [
+ NatLayer(
+ config=config,
+ dim=dim,
+ num_heads=num_heads,
+ drop_path_rate=drop_path_rate[i],
+ )
+ for i in range(depth)
+ ]
+ )
+
+ # patch merging layer
+ if downsample is not None:
+ self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
+ else:
+ self.downsample = None
+
+ self.pointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ _, height, width, _ = hidden_states.size()
+ for i, layer_module in enumerate(self.layers):
+ layer_outputs = layer_module(hidden_states, output_attentions)
+ hidden_states = layer_outputs[0]
+
+ hidden_states_before_downsampling = hidden_states
+ if self.downsample is not None:
+ hidden_states = self.downsample(hidden_states_before_downsampling)
+
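+        # Return both the (possibly downsampled) states fed to the next stage and the pre-downsampling states,
+        # which the encoder can expose via `output_hidden_states_before_downsampling`.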
+ stage_outputs = (hidden_states, hidden_states_before_downsampling)
+
+ if output_attentions:
+ stage_outputs += layer_outputs[1:]
+ return stage_outputs
+
+
+class NatEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.num_levels = len(config.depths)
+ self.config = config
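+        # Stochastic depth decay rule: per-layer drop-path rates increase linearly from 0 to
+        # `config.drop_path_rate` across all layers of all stages.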
+ dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
+ self.levels = nn.ModuleList(
+ [
+ NatStage(
+ config=config,
+ dim=int(config.embed_dim * 2**i_layer),
+ depth=config.depths[i_layer],
+ num_heads=config.num_heads[i_layer],
+ drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+ downsample=NatDownsampler if (i_layer < self.num_levels - 1) else None,
+ )
+ for i_layer in range(self.num_levels)
+ ]
+ )
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ output_attentions: Optional[bool] = False,
+ output_hidden_states: Optional[bool] = False,
+ output_hidden_states_before_downsampling: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple, NatEncoderOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_reshaped_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ if output_hidden_states:
+ # rearrange b h w c -> b c h w
+ reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
+ all_hidden_states += (hidden_states,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+ for i, layer_module in enumerate(self.levels):
+ layer_outputs = layer_module(hidden_states, output_attentions)
+
+ hidden_states = layer_outputs[0]
+ hidden_states_before_downsampling = layer_outputs[1]
+
+ if output_hidden_states and output_hidden_states_before_downsampling:
+ # rearrange b h w c -> b c h w
+ reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
+ all_hidden_states += (hidden_states_before_downsampling,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+ elif output_hidden_states and not output_hidden_states_before_downsampling:
+ # rearrange b h w c -> b c h w
+ reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
+ all_hidden_states += (hidden_states,)
+ all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+ if output_attentions:
+ all_self_attentions += layer_outputs[2:]
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+
+ return NatEncoderOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ reshaped_hidden_states=all_reshaped_hidden_states,
+ )
+
+
+class NatPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: NatConfig
+ base_model_prefix = "nat"
+ main_input_name = "pixel_values"
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+NAT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`NatConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+NAT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
+ for details.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Nat Model transformer outputting raw hidden-states without any specific head on top.",
+ NAT_START_DOCSTRING,
+)
+class NatModel(NatPreTrainedModel):
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+
+ requires_backends(self, ["natten"])
+
+ self.config = config
+ self.num_levels = len(config.depths)
+ self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
+
+ self.embeddings = NatEmbeddings(config)
+ self.encoder = NatEncoder(config)
+
+ self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
+ self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+        Prunes heads of the model.
+
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See base class `PreTrainedModel`.
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=NatModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, NatModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ embedding_output = self.embeddings(pixel_values)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+
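+ # Global average pooling: sequence_output is (batch, height, width, channels);
+ # flatten(1, 2) gives (batch, height*width, channels), transpose gives
+ # (batch, channels, height*width), AdaptiveAvgPool1d(1) averages the spatial
+ # axis, and the final flatten yields (batch, channels).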
+ pooled_output = None
+ if self.pooler is not None:
+ pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
+ pooled_output = torch.flatten(pooled_output, 1)
+
+ if not return_dict:
+ output = (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return output
+
+ return NatModelOutput(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+ )
+
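+# Minimal usage sketch for `NatModel`, assuming the `shi-labs/nat-mini-in1k-224`
+# checkpoint used elsewhere in this file and a working `natten` install:
+#
+# >>> from transformers import AutoImageProcessor, NatModel
+# >>> from PIL import Image
+# >>> import requests
+#
+# >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+# >>> image = Image.open(requests.get(url, stream=True).raw)
+# >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
+# >>> model = NatModel.from_pretrained("shi-labs/nat-mini-in1k-224")
+# >>> outputs = model(**processor(image, return_tensors="pt"))
+# >>> list(outputs.pooler_output.shape)  # (batch, num_features); 512 for nat-mini
+# [1, 512]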
+
+@add_start_docstrings(
+ """
+ Nat Model transformer with an image classification head on top (a linear layer on top of the pooled final
+ hidden state), e.g. for ImageNet.
+ """,
+ NAT_START_DOCSTRING,
+)
+class NatForImageClassification(NatPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ requires_backends(self, ["natten"])
+
+ self.num_labels = config.num_labels
+ self.nat = NatModel(config)
+
+ # Classifier head
+ self.classifier = (
+ nn.Linear(self.nat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=NatImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, NatImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1`, a regression loss is computed (Mean-Square loss). If
+ `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.nat(
+ pixel_values,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(labels, logits, self.config)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return NatImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ reshaped_hidden_states=outputs.reshaped_hidden_states,
+ )
+
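+# Classification sketch, assuming the same `shi-labs/nat-mini-in1k-224`
+# checkpoint (fine-tuned on ImageNet-1k):
+#
+# >>> from transformers import AutoImageProcessor, NatForImageClassification
+# >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
+# >>> model = NatForImageClassification.from_pretrained("shi-labs/nat-mini-in1k-224")
+# >>> inputs = processor(image, return_tensors="pt")  # `image` as in the sketch above
+# >>> predicted_id = model(**inputs).logits.argmax(-1).item()
+# >>> label = model.config.id2label[predicted_id]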
+
+@add_start_docstrings(
+ "NAT backbone, to be used with frameworks like DETR and MaskFormer.",
+ NAT_START_DOCSTRING,
+)
+class NatBackbone(NatPreTrainedModel, BackboneMixin):
+ def __init__(self, config):
+ super().__init__(config)
+ super()._init_backbone(config)
+
+ requires_backends(self, ["natten"])
+
+ self.embeddings = NatEmbeddings(config)
+ self.encoder = NatEncoder(config)
+ self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
+
+ # Add layer norms to hidden states of out_features
+ hidden_states_norms = {}
+ for stage, num_channels in zip(self.out_features, self.channels):
+ hidden_states_norms[stage] = nn.LayerNorm(num_channels)
+ self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.patch_embeddings
+
+ @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ output_hidden_states: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> BackboneOutput:
+ """
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import AutoImageProcessor, AutoBackbone
+ >>> import torch
+ >>> from PIL import Image
+ >>> import requests
+
+ >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ >>> image = Image.open(requests.get(url, stream=True).raw)
+
+ >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
+ >>> model = AutoBackbone.from_pretrained(
+ ... "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
+ ... )
+
+ >>> inputs = processor(image, return_tensors="pt")
+
+ >>> outputs = model(**inputs)
+
+ >>> feature_maps = outputs.feature_maps
+ >>> list(feature_maps[-1].shape)
+ [1, 512, 7, 7]
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+ embedding_output = self.embeddings(pixel_values)
+
+ outputs = self.encoder(
+ embedding_output,
+ output_attentions=output_attentions,
+ output_hidden_states=True,
+ output_hidden_states_before_downsampling=True,
+ return_dict=True,
+ )
+
+ hidden_states = outputs.reshaped_hidden_states
+
+ feature_maps = ()
+ for stage, hidden_state in zip(self.stage_names, hidden_states):
+ if stage in self.out_features:
+ # TODO can we simplify this?
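+ # nn.LayerNorm normalizes over the last (channel) dimension, so convert
+ # NCHW -> (batch, height*width, channels), normalize, then restore NCHW
+ # for downstream frameworks.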
+ batch_size, num_channels, height, width = hidden_state.shape
+ hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
+ hidden_state = hidden_state.view(batch_size, height * width, num_channels)
+ hidden_state = self.hidden_states_norms[stage](hidden_state)
+ hidden_state = hidden_state.view(batch_size, height, width, num_channels)
+ hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+ feature_maps += (hidden_state,)
+
+ if not return_dict:
+ output = (feature_maps,)
+ if output_hidden_states:
+ output += (outputs.hidden_states,)
+ return output
+
+ return BackboneOutput(
+ feature_maps=feature_maps,
+ hidden_states=outputs.hidden_states if output_hidden_states else None,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = ["NatForImageClassification", "NatModel", "NatPreTrainedModel", "NatBackbone"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0690129ae9edf84829278790b5e065bbd6608ee
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_nezha import *
+ from .modeling_nezha import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a9e41f8b5b8bdb4a94e796ca45760aabcbadb3e7
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd3850f62442cb6f5bb5d5286e69549a7982f757
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/configuration_nezha.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d90a4bada1dace9abd3529b671025ee2383e7dd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/__pycache__/modeling_nezha.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py
new file mode 100644
index 0000000000000000000000000000000000000000..00d193cd1ae68655e1631b7caea2220987a29f0b
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py
@@ -0,0 +1,105 @@
+from .... import PretrainedConfig
+
+
+class NezhaConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`NezhaModel`]. It is used to instantiate a
+ Nezha model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the Nezha
+ [sijunhe/nezha-cn-base](https://huggingface.co/sijunhe/nezha-cn-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 21128):
+ Vocabulary size of the NEZHA model. Defines the number of different tokens that can be represented by the
+ *inputs_ids* passed to the forward method of [`NezhaModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ (e.g., 512 or 1024 or 2048).
+ max_relative_position (`int`, *optional*, defaults to 64):
+ The maximum relative distance used by the functional relative position encoding. Relative distances are
+ clipped to `[-max_relative_position, max_relative_position]`.
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the *token_type_ids* passed into [`NezhaModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ classifier_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for attached classifiers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/value attentions. Only relevant if
+ `config.is_decoder=True`.
+ is_decoder (`bool`, *optional*, defaults to `False`):
+ Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+
+ Example:
+
+ ```python
+ >>> from transformers import NezhaConfig, NezhaModel
+
+ >>> # Initializing a Nezha configuration
+ >>> configuration = NezhaConfig()
+
+ >>> # Initializing a model (with random weights) from the nezha-cn-base style configuration
+ >>> model = NezhaModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "nezha"
+
+ def __init__(
+ self,
+ vocab_size=21128,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ max_relative_position=64,
+ type_vocab_size=2,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ classifier_dropout=0.1,
+ pad_token_id=0,
+ bos_token_id=2,
+ eos_token_id=3,
+ use_cache=True,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.intermediate_size = intermediate_size
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.max_relative_position = max_relative_position
+ self.type_vocab_size = type_vocab_size
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.classifier_dropout = classifier_dropout
+ self.use_cache = use_cache
+
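+# Configuration sketch: a deliberately small config for quick experiments. All
+# names are real `NezhaConfig` arguments; the values are illustrative only.
+#
+# >>> tiny = NezhaConfig(
+# ...     hidden_size=128, num_hidden_layers=2, num_attention_heads=2,
+# ...     intermediate_size=256, max_relative_position=16
+# ... )
+# >>> tiny.max_relative_position
+# 16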
+
+__all__ = ["NezhaConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddfecac9f50651ad16ff53814b262435e7edcc3e
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py
@@ -0,0 +1,1689 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Nezha model."""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ NextSentencePredictorOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_nezha import NezhaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "sijunhe/nezha-cn-base"
+_CONFIG_FOR_DOC = "NezhaConfig"
+
+
+def load_tf_weights_in_nezha(model, config, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+ # which are not required for using the pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ if pointer.shape != array.shape:
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
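+# Conversion sketch (requires TensorFlow; the checkpoint path below is a
+# placeholder, not a real file):
+#
+# >>> config = NezhaConfig()
+# >>> model = NezhaForPreTraining(config)  # defined later in this module
+# >>> model = load_tf_weights_in_nezha(model, config, "/path/to/nezha/model.ckpt")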
+
+class NezhaRelativePositionsEncoding(nn.Module):
+ """Implement the Functional Relative Position Encoding"""
+
+ def __init__(self, length, depth, max_relative_position=127):
+ super().__init__()
+ vocab_size = max_relative_position * 2 + 1
+ range_vec = torch.arange(length)
+ range_mat = range_vec.repeat(length).view(length, length)
+ distance_mat = range_mat - torch.t(range_mat)
+ distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position)
+ final_mat = distance_mat_clipped + max_relative_position
+
+ embeddings_table = torch.zeros(vocab_size, depth)
+ position = torch.arange(0, vocab_size, dtype=torch.int64).float().unsqueeze(1)
+ div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth))
+ embeddings_table[:, 0::2] = torch.sin(position * div_term)
+ embeddings_table[:, 1::2] = torch.cos(position * div_term)
+
+ flat_relative_positions_matrix = final_mat.view(-1)
+ one_hot_relative_positions_matrix = torch.nn.functional.one_hot(
+ flat_relative_positions_matrix, num_classes=vocab_size
+ ).float()
+ positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table)
+ my_shape = list(final_mat.size())
+ my_shape.append(depth)
+ positions_encoding = positions_encoding.view(my_shape)
+ self.register_buffer("positions_encoding", positions_encoding, persistent=False)
+
+ def forward(self, length):
+ return self.positions_encoding[:length, :length, :]
+
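+# Worked example of the clipped relative-distance table built above, for
+# length=4 and max_relative_position=2 (entries are clamp(j - i, -2, 2) + 2):
+#
+#     [[2, 3, 4, 4],
+#      [1, 2, 3, 4],
+#      [0, 1, 2, 3],
+#      [0, 0, 1, 2]]
+#
+# Each entry indexes a sinusoidal embedding row of size `depth`, so the
+# registered buffer has shape (length, length, depth).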
+
+class NezhaEmbeddings(nn.Module):
+ """Construct the embeddings from word and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.register_buffer(
+ "token_type_ids", torch.zeros((1, config.max_position_embeddings), dtype=torch.long), persistent=False
+ )
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ ) -> torch.Tensor:
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ # If token_type_ids is not passed (as usually happens when it is auto-generated), fall back to the all-zeros
+ # buffer registered in the constructor. The buffer lets users trace the model without passing token_type_ids
+ # and solves issue #5664.
+ if token_type_ids is None:
+ if hasattr(self, "token_type_ids"):
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=inputs_embeds.device)
+
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + token_type_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class NezhaSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0:
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.relative_positions_encoding = NezhaRelativePositionsEncoding(
+ length=config.max_position_embeddings,
+ depth=self.attention_head_size,
+ max_relative_position=config.max_relative_position,
+ )
+ self.is_decoder = config.is_decoder
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ mixed_query_layer = self.query(hidden_states)
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention and past_key_values is not None:
+ # reuse k,v, cross_attentions
+ key_layer = past_key_values[0]
+ value_layer = past_key_values[1]
+ attention_mask = encoder_attention_mask
+ elif is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_values is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_values` is always `None`
+ past_key_values = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
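+ # Relative-position term for the scores: fold batch and heads into one axis so
+ # a batched matmul against the (seq, seq, head_size) distance encodings yields
+ # a per-query-position score, then unfold and add it to the content scores.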
+ batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size()
+ relations_keys = self.relative_positions_encoding(to_seq_length)
+ query_layer_t = query_layer.permute(2, 0, 1, 3)
+
+ query_layer_r = query_layer_t.contiguous().view(
+ from_seq_length, batch_size * num_attention_heads, self.attention_head_size
+ )
+ key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1))
+ key_position_scores_r = key_position_scores.view(
+ from_seq_length, batch_size, num_attention_heads, from_seq_length
+ )
+ key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3)
+ attention_scores = attention_scores + key_position_scores_r_t
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in the NezhaModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
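+ # Mirror the relative-position trick on the value side: weight the distance
+ # encodings by the attention probabilities and add the result to the context.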
+ relations_values = self.relative_positions_encoding(to_seq_length)
+ attention_probs_t = attention_probs.permute(2, 0, 1, 3)
+ attentions_probs_r = attention_probs_t.contiguous().view(
+ from_seq_length, batch_size * num_attention_heads, to_seq_length
+ )
+ value_position_scores = torch.matmul(attentions_probs_r, relations_values)
+ value_position_scores_r = value_position_scores.view(
+ from_seq_length, batch_size, num_attention_heads, self.attention_head_size
+ )
+ value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3)
+ context_layer = context_layer + value_position_scores_r_t
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ if self.is_decoder:
+ outputs = outputs + (past_key_values,)
+ return outputs
+
+
+class NezhaSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class NezhaAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.self = NezhaSelfAttention(config)
+ self.output = NezhaSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_values,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class NezhaIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class NezhaOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class NezhaLayer(GradientCheckpointingLayer):
+ def __init__(self, config):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = NezhaAttention(config)
+ self.is_decoder = config.is_decoder
+ self.add_cross_attention = config.add_cross_attention
+ if self.add_cross_attention:
+ if not self.is_decoder:
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+ self.crossattention = NezhaAttention(config)
+ self.intermediate = NezhaIntermediate(config)
+ self.output = NezhaOutput(config)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_values=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+
+ # if decoder, the last output is tuple of self-attn cache
+ if self.is_decoder:
+ outputs = self_attention_outputs[1:-1]
+ present_key_value = self_attention_outputs[-1]
+ else:
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ cross_attn_present_key_value = None
+ if self.is_decoder and encoder_hidden_states is not None:
+ if not hasattr(self, "crossattention"):
+ raise ValueError(
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+ " by setting `config.add_cross_attention=True`"
+ )
+
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple
+ cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+ cross_attention_outputs = self.crossattention(
+ attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ cross_attn_past_key_value,
+ output_attentions,
+ )
+ attention_output = cross_attention_outputs[0]
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
+
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
+ cross_attn_present_key_value = cross_attention_outputs[-1]
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+ )
+ outputs = (layer_output,) + outputs
+
+ # if decoder, return the attn key/values as the last output
+ if self.is_decoder:
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+
+class NezhaEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([NezhaLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = False,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ next_decoder_cache = () if use_cache else None
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_values[i] if past_key_values is not None else None,
+ output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ if self.config.add_cross_attention:
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+class NezhaPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class NezhaPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class NezhaLMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = NezhaPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def _tie_weights(self):
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+class NezhaOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = NezhaLMPredictionHead(config)
+
+ def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class NezhaOnlyNSPHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, pooled_output):
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return seq_relationship_score
+
+
+class NezhaPreTrainingHeads(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = NezhaLMPredictionHead(config)
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, sequence_output, pooled_output):
+ prediction_scores = self.predictions(sequence_output)
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return prediction_scores, seq_relationship_score
+
+
+class NezhaPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: NezhaConfig
+ load_tf_weights = load_tf_weights_in_nezha
+ base_model_prefix = "nezha"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+@dataclass
+class NezhaForPreTrainingOutput(ModelOutput):
+ """
+ Output type of [`NezhaForPreTraining`].
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Total loss as the sum of the masked language modeling loss and the next sequence prediction
+ (classification) loss.
+ prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+ Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+ before SoftMax).
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ prediction_logits: Optional[torch.FloatTensor] = None
+ seq_relationship_logits: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+NEZHA_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
+ heads, etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general
+ usage and behavior.
+
+ Parameters:
+ config ([`NezhaConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+NEZHA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Nezha Model transformer outputting raw hidden-states without any specific head on top.",
+ NEZHA_START_DOCSTRING,
+)
+class NezhaModel(NezhaPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both the `is_decoder` and
+ `add_cross_attention` arguments set to `True`; an `encoder_hidden_states` input is then expected in the forward pass.
+ """
+
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = NezhaEmbeddings(config)
+ self.encoder = NezhaEncoder(config)
+
+ self.pooler = NezhaPooler(config) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
+ base class `PreTrainedModel` for details.
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.config.is_decoder:
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ else:
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ batch_size, seq_length = input_shape
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ # past_key_values_length
+ past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+ if attention_mask is None:
+ attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
+ if token_type_ids is None:
+ if hasattr(self.embeddings, "token_type_ids"):
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if self.config.is_decoder and encoder_hidden_states is not None:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+ if encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ )
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
+
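+# Encoder usage sketch, assuming the `sijunhe/nezha-cn-base` checkpoint
+# referenced throughout this module (default hidden_size is 768):
+#
+# >>> from transformers import AutoTokenizer, NezhaModel
+# >>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
+# >>> model = NezhaModel.from_pretrained("sijunhe/nezha-cn-base")
+# >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+# >>> outputs = model(**inputs)
+# >>> outputs.last_hidden_state.shape[-1]
+# 768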
+
+@add_start_docstrings(
+ """
+ Nezha Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+ sentence prediction (classification)` head.
+ """,
+ NEZHA_START_DOCSTRING,
+)
+class NezhaForPreTraining(NezhaPreTrainedModel):
+ _tied_weights_keys = ["cls.predictions.decoder"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.nezha = NezhaModel(config)
+ self.cls = NezhaPreTrainingHeads(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+ self.cls.predictions.bias = new_embeddings.bias
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ next_sentence_label: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], NezhaForPreTrainingOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked);
+ the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
+ pair (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+ - 0 indicates sequence B is a continuation of sequence A,
+ - 1 indicates sequence B is a random sequence.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, NezhaForPreTraining
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
+ >>> model = NezhaForPreTraining.from_pretrained("sijunhe/nezha-cn-base")
+
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> prediction_logits = outputs.prediction_logits
+ >>> seq_relationship_logits = outputs.seq_relationship_logits
+ ```
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.nezha(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output, pooled_output = outputs[:2]
+ prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+ total_loss = None
+ if labels is not None and next_sentence_label is not None:
+ loss_fct = CrossEntropyLoss()
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+ next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+ total_loss = masked_lm_loss + next_sentence_loss
+
+ if not return_dict:
+ output = (prediction_scores, seq_relationship_score) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return NezhaForPreTrainingOutput(
+ loss=total_loss,
+ prediction_logits=prediction_scores,
+ seq_relationship_logits=seq_relationship_score,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING)
+class NezhaForMaskedLM(NezhaPreTrainedModel):
+ _tied_weights_keys = ["cls.predictions.decoder"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ if config.is_decoder:
+ logger.warning(
+ "If you want to use `NezhaForMaskedLM` make sure `config.is_decoder=False` for "
+ "bi-directional self-attention."
+ )
+
+ self.nezha = NezhaModel(config, add_pooling_layer=False)
+ self.cls = NezhaOnlyMLMHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+ self.cls.predictions.bias = new_embeddings.bias
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.nezha(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ prediction_scores = self.cls(sequence_output)
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+ input_shape = input_ids.shape
+ effective_batch_size = input_shape[0]
+
+ # add a dummy token
+ if self.config.pad_token_id is None:
+ raise ValueError("The PAD token should be defined for generation")
+
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_shape)
+ attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+ dummy_token = torch.full(
+ (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+ )
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
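+ # e.g. input_ids [[7, 8, 9]] with mask [[1, 1, 1]] become [[7, 8, 9, pad]] with
+ # mask [[1, 1, 1, 0]], so the appended dummy token is never attended to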
+
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+@add_start_docstrings(
+ """Nezha Model with a `next sentence prediction (classification)` head on top.""",
+ NEZHA_START_DOCSTRING,
+)
+class NezhaForNextSentencePrediction(NezhaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.nezha = NezhaModel(config)
+ self.cls = NezhaOnlyNSPHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the next sentence prediction (classification) loss. Input should be a sequence pair
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+ - 0 indicates sequence B is a continuation of sequence A,
+ - 1 indicates sequence B is a random sequence.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, NezhaForNextSentencePrediction
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
+ >>> model = NezhaForNextSentencePrediction.from_pretrained("sijunhe/nezha-cn-base")
+
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+ >>> logits = outputs.logits
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ ```
+ """
+
+ if "next_sentence_label" in kwargs:
+ warnings.warn(
+ "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
+ " `labels` instead.",
+ FutureWarning,
+ )
+ labels = kwargs.pop("next_sentence_label")
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.nezha(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ seq_relationship_scores = self.cls(pooled_output)
+
+ next_sentence_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+ if not return_dict:
+ output = (seq_relationship_scores,) + outputs[2:]
+ return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
+
+ return NextSentencePredictorOutput(
+ loss=next_sentence_loss,
+ logits=seq_relationship_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Nezha Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+ output) e.g. for GLUE tasks.
+ """,
+ NEZHA_START_DOCSTRING,
+)
+class NezhaForSequenceClassification(NezhaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.config = config
+
+ self.nezha = NezhaModel(config)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (mean-square loss); if
+ `config.num_labels > 1` a classification loss is computed (cross-entropy).
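+ For example, integer labels with `config.num_labels == 2` select the cross-entropy
+ path, while float targets with `config.num_labels == 1` fall back to mean-square
+ error (see the `problem_type` inference in the implementation below).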
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.nezha(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Nezha Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+ softmax) e.g. for RocStories/SWAG tasks.
+ """,
+ NEZHA_START_DOCSTRING,
+)
+class NezhaForMultipleChoice(NezhaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.nezha = NezhaModel(config)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, 1)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+ `input_ids` above)
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
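+ # shape walkthrough (illustrative): input_ids of shape (2, 4, 128) give
+ # num_choices = 4, the encoder sees a flattened (8, 128) batch, and the
+ # per-choice logits of shape (8, 1) are reshaped back to (2, 4) below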
+
+ outputs = self.nezha(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Nezha Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ NEZHA_START_DOCSTRING,
+)
+class NezhaForTokenClassification(NezhaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.nezha = NezhaModel(config, add_pooling_layer=False)
+ classifier_dropout = (
+ config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+ )
+ self.dropout = nn.Dropout(classifier_dropout)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.nezha(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ Nezha Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ NEZHA_START_DOCSTRING,
+)
+class NezhaForQuestionAnswering(NezhaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.nezha = NezhaModel(config, add_pooling_layer=False)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ start_positions: Optional[torch.Tensor] = None,
+ end_positions: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.nezha(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, the positions may carry an extra dimension; squeeze it
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs, we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
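+ # e.g. with a sequence length of 384, a start position of 500 is clamped to
+ # 384 and then skipped via CrossEntropyLoss(ignore_index=384)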
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "NezhaForNextSentencePrediction",
+ "NezhaForMaskedLM",
+ "NezhaForPreTraining",
+ "NezhaForMultipleChoice",
+ "NezhaForQuestionAnswering",
+ "NezhaForSequenceClassification",
+ "NezhaForTokenClassification",
+ "NezhaModel",
+ "NezhaPreTrainedModel",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b3964d194bed041987c6236b5c60bfcf3b7caf4
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2023 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_open_llama import *
+ from .modeling_open_llama import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04d369425a0ec19dbd48a6cca9304c198289608d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64db71a4590d2a02480904d9c7ceb74bdb897f3b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/configuration_open_llama.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b0408ffc49facf44e6e353a972e7b7adac6242d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/__pycache__/modeling_open_llama.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bc9cc72a7661fe571cce6649b40f2307d9b3ce
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Open-Llama model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class OpenLlamaConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`OpenLlamaModel`]. It is used to instantiate an
+ Open-Llama model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the
+ [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 100000):
+ Vocabulary size of the Open-Llama model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`OpenLlamaModel`]
+ hidden_size (`int`, *optional*, defaults to 4096):
+ Dimension of the hidden representations.
+ intermediate_size (`int`, *optional*, defaults to 11008):
+ Dimension of the MLP representations.
+ num_hidden_layers (`int`, *optional*, defaults to 32):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 32):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+ The non-linear activation function (function or string) in the decoder.
+ max_position_embeddings (`int`, *optional*, defaults to 2048):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ rms_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the rms normalization layers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+ Whether to tie weight embeddings
+ rope_theta (`float`, *optional*, defaults to 10000.0):
+ The base period of the RoPE embeddings.
+ rope_scaling (`Dict`, *optional*):
+ Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+ strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+ `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+ `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+ these scaling strategies behave:
+ https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+ experimental feature, subject to breaking API changes in future versions.
+
+ Example:
+
+ ```python
+ >>> from transformers import OpenLlamaModel, OpenLlamaConfig
+
+ >>> # Initializing a Open-Llama open_llama-7b style configuration
+ >>> configuration = OpenLlamaConfig()
+
+ >>> # Initializing a model from the open_llama-7b style configuration
+ >>> model = OpenLlamaModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
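+
+ >>> # (Illustrative) enabling RoPE scaling: `rope_scaling` must be a dict with
+ >>> # "type" in {"linear", "dynamic"} and a float "factor" > 1.0
+ >>> scaled_configuration = OpenLlamaConfig(rope_scaling={"type": "linear", "factor": 2.0})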
+ ```"""
+
+ model_type = "open-llama"
+
+ def __init__(
+ self,
+ vocab_size=100000,
+ hidden_size=4096,
+ intermediate_size=11008,
+ num_hidden_layers=32,
+ num_attention_heads=32,
+ hidden_act="silu",
+ max_position_embeddings=2048,
+ initializer_range=0.02,
+ rms_norm_eps=1e-6,
+ use_cache=True,
+ pad_token_id=0,
+ bos_token_id=1,
+ eos_token_id=2,
+ tie_word_embeddings=False,
+ use_memory_efficient_attention=True,
+ hidden_dropout_prob=0.1,
+ attention_dropout_prob=0.1,
+ use_stable_embedding=True,
+ shared_input_output_embedding=True,
+ rope_theta=10000.0,
+ rope_scaling=None,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.intermediate_size = intermediate_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.rms_norm_eps = rms_norm_eps
+ self.use_cache = use_cache
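+ # also accept the historical misspelling "use_memorry_efficient_attention" so
+ # that older configs keep working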
+ self.use_memory_efficient_attention = kwargs.pop(
+ "use_memorry_efficient_attention", use_memory_efficient_attention
+ )
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_dropout_prob = attention_dropout_prob
+ self.use_stable_embedding = use_stable_embedding
+ self.shared_input_output_embedding = shared_input_output_embedding
+ self.rope_theta = rope_theta
+ self.rope_scaling = rope_scaling
+ self._rope_scaling_validation()
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ tie_word_embeddings=tie_word_embeddings,
+ **kwargs,
+ )
+
+ def _rope_scaling_validation(self):
+ """
+ Validate the `rope_scaling` configuration.
+ """
+ if self.rope_scaling is None:
+ return
+
+ if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+ raise ValueError(
+ f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
+ )
+ rope_scaling_type = self.rope_scaling.get("type", None)
+ rope_scaling_factor = self.rope_scaling.get("factor", None)
+ if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+ raise ValueError(
+ f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+ )
+ if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+ raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+
+
+__all__ = ["OpenLlamaConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e182e0f813f1c0c9ed138c39561cc23eff8cb26
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py
@@ -0,0 +1,940 @@
+# coding=utf-8
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Open-Llama model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_open_llama import OpenLlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+try:
+ from xformers import ops as xops
+except ImportError:
+ xops = None
+
+
+_CONFIG_FOR_DOC = "OpenLlamaConfig"
+
+
+class OpenLlamaRMSNorm(nn.Module):
+ def __init__(self, hidden_size, eps=1e-6):
+ """
+ OpenLlamaRMSNorm is equivalent to T5LayerNorm
+ """
+ super().__init__()
+ self.weight = nn.Parameter(torch.ones(hidden_size))
+ self.variance_epsilon = eps
+
+ def forward(self, hidden_states):
+ input_dtype = hidden_states.dtype
+ hidden_states = hidden_states.to(torch.float32)
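+ # RMSNorm: y = weight * x / sqrt(mean(x^2) + eps); unlike LayerNorm, there is
+ # no mean subtraction and no bias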
+ variance = hidden_states.pow(2).mean(-1, keepdim=True)
+ hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+ return self.weight * hidden_states.to(input_dtype)
+
+ def extra_repr(self):
+ return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class OpenLlamaRotaryEmbedding(nn.Module):
+ inv_freq: torch.Tensor # fix linting for `register_buffer`
+ cos_cached: torch.Tensor
+ sin_cached: torch.Tensor
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+ super().__init__()
+
+ self.dim = dim
+ self.max_position_embeddings = max_position_embeddings
+ self.base = base
+ inv_freq = 1.0 / (
+ self.base
+ ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim)
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ # Build here to make `torch.jit.trace` work.
+ self._set_cos_sin_cache(
+ seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+ )
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+ freqs = torch.outer(t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+ def forward(self, x, seq_len=None):
+ # x: [bs, num_attention_heads, seq_len, head_size]
+ if seq_len > self.max_seq_len_cached:
+ self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+ return (
+ self.cos_cached[:seq_len].to(dtype=x.dtype),
+ self.sin_cached[:seq_len].to(dtype=x.dtype),
+ )
+
+
+class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
+ """OpenLlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
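+ # linear scaling: compress the position indices by `scaling_factor`, e.g. a
+ # factor of 2.0 maps positions 0..4095 onto the trained 0..2047 range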
+ t = t / self.scaling_factor
+
+ freqs = torch.outer(t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
+ """OpenLlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+ def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+ self.scaling_factor = scaling_factor
+ super().__init__(dim, max_position_embeddings, base, device)
+
+ def _set_cos_sin_cache(self, seq_len, device, dtype):
+ self.max_seq_len_cached = seq_len
+
+ if seq_len > self.max_position_embeddings:
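+ # NTK-aware scaling: once the context exceeds max_position_embeddings, grow
+ # the RoPE base so that inv_freq shrinks smoothly instead of hard-truncating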
+ base = self.base * (
+ (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+ ) ** (self.dim / (self.dim - 2))
+ inv_freq = 1.0 / (
+ base
+ ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim)
+ )
+ self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+ t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+ freqs = torch.outer(t, self.inv_freq)
+ # Different from paper, but it uses a different permutation in order to obtain the same calculation
+ emb = torch.cat((freqs, freqs), dim=-1)
+ self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+ self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+def rotate_half(x):
+ """Rotates half the hidden dims of the input."""
+ x1 = x[..., : x.shape[-1] // 2]
+ x2 = x[..., x.shape[-1] // 2 :]
+ return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+ """Applies Rotary Position Embedding to the query and key tensors.
+
+ Args:
+ q (`torch.Tensor`): The query tensor.
+ k (`torch.Tensor`): The key tensor.
+ cos (`torch.Tensor`): The cosine part of the rotary embedding.
+ sin (`torch.Tensor`): The sine part of the rotary embedding.
+ position_ids (`torch.Tensor`):
+ The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+ used to pass offsetted position ids when working with a KV-cache.
+ unsqueeze_dim (`int`, *optional*, defaults to 1):
+ The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+ sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+ that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+ k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+ cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+ the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+ Returns:
+ `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+ """
+ cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+ sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+ q_embed = (q * cos) + (rotate_half(q) * sin)
+ k_embed = (k * cos) + (rotate_half(k) * sin)
+ return q_embed, k_embed
+
+
+class OpenLlamaMLP(nn.Module):
+ def __init__(
+ self,
+ hidden_size: int,
+ intermediate_size: int,
+ hidden_act: str,
+ dropout_prob: float,
+ ):
+ super().__init__()
+ self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+ self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+ self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+ self.act_fn = ACT2FN[hidden_act]
+ self.dropout = nn.Dropout(dropout_prob)
+
+ def forward(self, x):
+ out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+ return self.dropout(out)
+
+
+class OpenLlamaAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(self, config: OpenLlamaConfig):
+ super().__init__()
+ self.config = config
+ self.hidden_size = config.hidden_size
+ self.num_heads = config.num_attention_heads
+ self.head_dim = self.hidden_size // self.num_heads
+ self.max_position_embeddings = config.max_position_embeddings
+ self.dropout_prob = config.attention_dropout_prob
+ self.rope_theta = config.rope_theta
+
+ if (self.head_dim * self.num_heads) != self.hidden_size:
+ raise ValueError(
+ f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+ f" and `num_heads`: {self.num_heads})."
+ )
+ self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+ self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+ self._init_rope()
+
+ def _init_rope(self):
+ if self.config.rope_scaling is None:
+ self.rotary_emb = OpenLlamaRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ base=self.rope_theta,
+ )
+ else:
+ scaling_type = self.config.rope_scaling["type"]
+ scaling_factor = self.config.rope_scaling["factor"]
+ if scaling_type == "linear":
+ self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ elif scaling_type == "dynamic":
+ self.rotary_emb = OpenLlamaDynamicNTKScalingRotaryEmbedding(
+ self.head_dim,
+ max_position_embeddings=self.max_position_embeddings,
+ scaling_factor=scaling_factor,
+ base=self.rope_theta,
+ )
+ else:
+ raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: bool = False,
+ use_cache: bool = False,
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+ bsz, q_len, _ = hidden_states.size()
+
+ query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+ value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+ kv_seq_len = key_states.shape[-2]
+ if past_key_values is not None:
+ kv_seq_len += past_key_values[0].shape[-2]
+ cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+ query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+ # [bsz, nh, t, hd]
+
+ if past_key_values is not None:
+ # reuse k, v, self_attention
+ key_states = torch.cat([past_key_values[0], key_states], dim=2)
+ value_states = torch.cat([past_key_values[1], value_states], dim=2)
+
+ past_key_values = (key_states, value_states) if use_cache else None
+
+ if self.config.use_memory_efficient_attention and xops is not None and self.training:
+ attn_weights = None
+ query_states = query_states.transpose(1, 2)
+ key_states = key_states.transpose(1, 2)
+ value_states = value_states.transpose(1, 2)
+ attn_output = xops.memory_efficient_attention(
+ query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask(), p=self.dropout_prob
+ )
+ else:
+ attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+ if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights + attention_mask
+ attn_weights = torch.max(
+ attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
+ )
+
+ # upcast attention to fp32
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+ attn_output = torch.matmul(attn_weights, value_states)
+
+ if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.transpose(1, 2)
+ attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+ attn_output = self.o_proj(attn_output)
+
+ if not output_attentions:
+ attn_weights = None
+
+ return attn_output, attn_weights, past_key_values
+
+
+class OpenLlamaDecoderLayer(GradientCheckpointingLayer):
+ def __init__(self, config: OpenLlamaConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+ self.self_attn = OpenLlamaAttention(config=config)
+ self.mlp = OpenLlamaMLP(
+ hidden_size=self.hidden_size,
+ intermediate_size=config.intermediate_size,
+ hidden_act=config.hidden_act,
+ dropout_prob=config.hidden_dropout_prob,
+ )
+ self.input_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+ self.post_attention_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = False,
+ ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+ (see `past_key_values`).
+ past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+ """
+
+ residual = hidden_states
+
+ hidden_states = self.input_layernorm(hidden_states)
+
+ # Self Attention
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = residual + hidden_states
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.post_attention_layernorm(hidden_states)
+ hidden_states = self.mlp(hidden_states)
+ hidden_states = residual + hidden_states
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights,)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+OPEN_LLAMA_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`OpenLlamaConfig`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
+ OPEN_LLAMA_START_DOCSTRING,
+)
+class OpenLlamaPreTrainedModel(PreTrainedModel):
+ config: OpenLlamaConfig
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["OpenLlamaDecoderLayer"]
+
+ def _init_weights(self, module):
+ std = self.config.initializer_range
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ if self.config.use_stable_embedding:
+ torch.nn.init.xavier_normal_(module.weight.data)
+ else:
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+
+OPEN_LLAMA_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+
+ If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+ and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+ information on the default strategy.
+
+ position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.n_positions - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+ `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+ `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+ blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
+ OPEN_LLAMA_START_DOCSTRING,
+)
+class OpenLlamaModel(OpenLlamaPreTrainedModel):
+ """
+ Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OpenLlamaDecoderLayer`]
+
+ Args:
+ config: OpenLlamaConfig
+ """
+
+ def __init__(self, config: OpenLlamaConfig):
+ super().__init__(config)
+ self.padding_idx = config.pad_token_id
+ self.vocab_size = config.vocab_size
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+ if config.use_stable_embedding:
+ self.embed_layer_norm = nn.LayerNorm(config.hidden_size)
+ else:
+ self.embed_layer_norm = None
+ self.layers = nn.ModuleList([OpenLlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+ self.norm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, BaseModelOutputWithPast]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ batch_size, seq_length = input_ids.shape
+ elif inputs_embeds is not None:
+ batch_size, seq_length, _ = inputs_embeds.shape
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ seq_length_with_past = seq_length
+ past_key_values_length = 0
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ if past_key_values is not None:
+ past_key_values_length = past_key_values.get_seq_length()
+ seq_length_with_past = seq_length_with_past + past_key_values_length
+
+ if position_ids is None:
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+ position_ids = torch.arange(
+ past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+ )
+ position_ids = position_ids.unsqueeze(0)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids)
+ if self.embed_layer_norm is not None:
+ inputs_embeds = self.embed_layer_norm(inputs_embeds)
+ # embed positions
+ if self.config.use_memory_efficient_attention and self.training:
+ attention_mask = None
+ elif attention_mask is None:
+ attention_mask = torch.ones(
+ (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+ )
+
+ input_shape = (batch_size, seq_length)
+ attention_mask = _prepare_4d_causal_attention_mask(
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
+ )
+
+ hidden_states = inputs_embeds
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ next_decoder_cache = () if use_cache else None
+
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values[idx] if past_key_values is not None else None,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ hidden_states = self.norm(hidden_states)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+ return BaseModelOutputWithPast(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ )
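+
+ # A small sketch of the bare model's output contract (the names and tensors
+ # here are illustrative, not part of this module):
+ #
+ #   model = OpenLlamaModel(config)
+ #   out = model(input_ids=torch.ones(2, 8, dtype=torch.long))
+ #   out.last_hidden_state.shape  # -> (2, 8, config.hidden_size)
+ #
+ # With `return_dict=False` the same values come back as a plain tuple with the
+ # `None` entries filtered out, as in the `tuple(...)` branch above.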
+
+
+class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.model = OpenLlamaModel(config)
+ if config.shared_input_output_embedding:
+ self.lm_head = None
+ else:
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, CausalLMOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, OpenLlamaForCausalLM
+
+ >>> model = OpenLlamaForCausalLM.from_pretrained("openlm-research/open_llama_7b")
+ >>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
+
+ >>> prompt = "Hey, are you conscious? Can you talk to me?"
+ >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+ >>> # Generate
+ >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+ >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+ "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ hidden_states = outputs[0]
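+ # With tied input/output embeddings, the logits are simply the hidden states
+ # projected onto the embedding matrix: "blh,vh->blv" computes
+ # hidden_states @ embed_tokens.weight.T, one score per vocabulary entry.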
+ if self.config.shared_input_output_embedding:
+ logits = torch.einsum(
+ "blh,vh->blv", hidden_states.to(self.model.embed_tokens.weight.device), self.model.embed_tokens.weight
+ )
+ else:
+ logits = self.lm_head(hidden_states)
+
+ loss = None
+ if labels is not None:
+ # move labels to correct device to enable model parallelism
+ labels = labels.to(logits.device)
+ # Shift so that tokens < n predict n
+ shift_logits = logits[..., :-1, :].contiguous()
+ shift_labels = labels[..., 1:].contiguous()
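+ # Worked sketch: for labels [t0, t1, t2, t3],
+ #   shift_logits keeps positions [l0, l1, l2] (l3 is dropped by `[..., :-1, :]`)
+ #   shift_labels keeps targets   [t1, t2, t3] (t0 is dropped by `[..., 1:]`)
+ # so position i is always scored against the *next* token t(i+1).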
+ # Flatten the tokens
+ loss_fct = CrossEntropyLoss()
+ shift_logits = shift_logits.view(-1, self.config.vocab_size)
+ shift_labels = shift_labels.view(-1)
+ # Enable model parallelism
+ shift_labels = shift_labels.to(shift_logits.device)
+ loss = loss_fct(shift_logits, shift_labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithPast(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+ ):
+ if past_key_values is not None:
+ past_length = past_key_values.get_seq_length()
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+
+ position_ids = kwargs.get("position_ids")
+ if attention_mask is not None and position_ids is None:
+ # create position_ids on the fly for batch generation
+ position_ids = attention_mask.long().cumsum(-1) - 1
+ position_ids.masked_fill_(attention_mask == 0, 1)
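+ # Worked sketch for a left-padded batch:
+ #   attention_mask = [[0, 0, 1, 1]]   # two pads, then two real tokens
+ #   cumsum(-1) - 1 -> [[-1, -1, 0, 1]]
+ #   masked_fill    -> [[ 1,  1, 0, 1]]   # pads get a dummy position of 1
+ # so real tokens count up from 0 regardless of how much padding precedes them.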
+ if past_key_values:
+ position_ids = position_ids[:, -input_ids.shape[1] :]
+
+ # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+ if inputs_embeds is not None and past_key_values is None:
+ model_inputs = {"inputs_embeds": inputs_embeds}
+ else:
+ model_inputs = {"input_ids": input_ids}
+
+ model_inputs.update(
+ {
+ "position_ids": position_ids,
+ "past_key_values": past_key_values,
+ "use_cache": kwargs.get("use_cache"),
+ "attention_mask": attention_mask,
+ }
+ )
+ return model_inputs
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
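+
+ # Sketch: during beam search `beam_idx` maps each beam slot to the batch row it
+ # continues from, e.g. beam_idx = torch.tensor([2, 0, 0]) makes every cached
+ # key/value tensor take rows (2, 0, 0) along dim 0, so all beams carry a
+ # consistent past.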
+
+
+@add_start_docstrings(
+ """
+ The Open-Llama Model transformer with a sequence classification head on top (linear layer).
+
+ [`OpenLlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal
+ models (e.g. GPT-2) do.
+
+ Since it does classification on the last token, it needs to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ OPEN_LLAMA_START_DOCSTRING,
+)
+class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.model = OpenLlamaModel(config)
+ self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.model(
+ input_ids,
+ attention_mask=attention_mask,
+ position_ids=position_ids,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size = input_ids.shape[0]
+ else:
+ batch_size = inputs_embeds.shape[0]
+
+ if self.config.pad_token_id is None and batch_size != 1:
+ raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
+ sequence_lengths = sequence_lengths.to(logits.device)
+ else:
+ sequence_lengths = -1
+
+ pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
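+ # Worked sketch of the lookup above with pad_token_id = 0 and seq_len = 4:
+ #   input_ids = [[5, 6, 0, 0]]: eq(pad).int().argmax(-1) = 2, minus 1 = 1, 1 % 4 = 1
+ #     (index of the last real token)
+ #   input_ids = [[5, 6, 7, 8]]: argmax over all zeros = 0, minus 1 = -1, -1 % 4 = 3
+ #     (no padding anywhere: the modulo wraps -1 around to the final position)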
+
+ loss = None
+ if labels is not None:
+ labels = labels.to(logits.device)
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ past_key_values=transformer_outputs.past_key_values,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+
+__all__ = ["OpenLlamaPreTrainedModel", "OpenLlamaModel", "OpenLlamaForCausalLM", "OpenLlamaForSequenceClassification"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..864b321bc2ee3a521e3d6da5403cab7363dea56b
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_qdqbert import *
+ from .modeling_qdqbert import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34ccd57657abda438441e5f37669ebe6852eb62d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e22e19e1cb55baea95b60cc78b2d111545c3cc39
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/configuration_qdqbert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4446be648219fa9208cc25fc3545daa19dd182bd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/__pycache__/modeling_qdqbert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ac82bc5a0292355ac659548c7c3d4336f2536c
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
@@ -0,0 +1,123 @@
+# coding=utf-8
+# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""QDQBERT model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class QDQBertConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to instantiate a
+ QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the BERT
+ [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the QDQBERT model. Defines the number of different tokens that can be represented by the
+ `input_ids` passed when calling [`QDQBertModel`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimension of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`QDQBertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ is_decoder (`bool`, *optional*, defaults to `False`):
+ Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+
+ Examples:
+
+ ```python
+ >>> from transformers import QDQBertModel, QDQBertConfig
+
+ >>> # Initializing a QDQBERT google-bert/bert-base-uncased style configuration
+ >>> configuration = QDQBertConfig()
+
+ >>> # Initializing a model from the google-bert/bert-base-uncased style configuration
+ >>> model = QDQBertModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "qdqbert"
+
+ def __init__(
+ self,
+ vocab_size=30522,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=2,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ use_cache=True,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.type_vocab_size = type_vocab_size
+ self.layer_norm_eps = layer_norm_eps
+ self.use_cache = use_cache
+
+
+__all__ = ["QDQBertConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..f92bc07a8bfb1c495773be09799a4f06744849f9
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
@@ -0,0 +1,1736 @@
+# coding=utf-8
+# Copyright 2021 NVIDIA Corporation and The HuggingFace Team.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch QDQBERT model."""
+
+import math
+import os
+import warnings
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ CausalLMOutputWithCrossAttentions,
+ MaskedLMOutput,
+ MultipleChoiceModelOutput,
+ NextSentencePredictorOutput,
+ QuestionAnsweringModelOutput,
+ SequenceClassifierOutput,
+ TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_pytorch_quantization_available,
+ logging,
+ replace_return_docstrings,
+ requires_backends,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_qdqbert import QDQBertConfig
+
+
+logger = logging.get_logger(__name__)
+
+# soft dependency
+if is_pytorch_quantization_available():
+ try:
+ from pytorch_quantization import nn as quant_nn
+ from pytorch_quantization.nn.modules.tensor_quantizer import TensorQuantizer
+ except OSError:
+ logger.error(
+ "QDQBERT model are not usable since `pytorch_quantization` can't be loaded. Please try to reinstall it"
+ " following the instructions here:"
+ " https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization."
+ )
+
+_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
+_CONFIG_FOR_DOC = "QDQBertConfig"
+
+
+def load_tf_weights_in_qdqbert(model, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+ # which are not required for using pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ if pointer.shape != array.shape:
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+ except ValueError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
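+
+# A minimal usage sketch (the checkpoint path is illustrative):
+#
+#   config = QDQBertConfig()
+#   model = QDQBertModel(config)
+#   load_tf_weights_in_qdqbert(model, "/path/to/bert_model.ckpt")
+#
+# TF variables named kernel/gamma map onto `weight`, and output_bias/beta onto
+# `bias`, per the attribute walk above.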
+
+
+class QDQBertEmbeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+ )
+ self.register_buffer(
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+ )
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ past_key_values_length: int = 0,
+ ) -> torch.Tensor:
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+ # If token_type_ids is not passed (which usually happens when it is auto-generated),
+ # fall back to the all-zeros buffer registered in the constructor. The registered
+ # buffer lets users trace the model without passing token_type_ids; solves issue #5664
+ if token_type_ids is None:
+ if hasattr(self, "token_type_ids"):
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + token_type_embeddings
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings += position_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
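+
+ # Shape sketch: input_ids (batch, seq) -> inputs_embeds (batch, seq, hidden);
+ # token-type and, for "absolute", position embeddings are added element-wise,
+ # so the output after LayerNorm and dropout stays (batch, seq, hidden).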
+
+
+class QDQBertSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
+ self.key = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
+ self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+ self.is_decoder = config.is_decoder
+
+ self.matmul_q_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+ self.matmul_k_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+ self.matmul_v_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+ self.matmul_a_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
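+
+ # Shape sketch for the reshape above, with hidden_size=768 and 12 heads:
+ #   (batch, seq, 768) -> view -> (batch, seq, 12, 64) -> permute -> (batch, 12, seq, 64)
+ # so every head attends over the sequence with its own 64-dim slice.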
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ output_attentions=False,
+ ):
+ mixed_query_layer = self.query(hidden_states)
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention and past_key_values is not None:
+ # reuse k,v, cross_attentions
+ key_layer = past_key_values[0]
+ value_layer = past_key_values[1]
+ attention_mask = encoder_attention_mask
+ elif is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_values is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_values` is always `None`
+ past_key_values = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(
+ self.matmul_q_input_quantizer(query_layer), self.matmul_k_input_quantizer(key_layer.transpose(-1, -2))
+ )
+
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ seq_length = hidden_states.size()[1]
+ position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+ position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+ distance = position_ids_l - position_ids_r
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in the QDQBertModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(
+ self.matmul_a_input_quantizer(attention_probs), self.matmul_v_input_quantizer(value_layer)
+ )
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ if self.is_decoder:
+ outputs = outputs + (past_key_values,)
+ return outputs
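+
+ # The four TensorQuantizer wrappers above fake-quantize the inputs of both
+ # matmuls (the QDQ pattern), which is what lets TensorRT execute them in INT8
+ # after export. A calibration pass looks roughly like this sketch (see the
+ # pytorch-quantization docs for the exact workflow):
+ #
+ #   for module in model.modules():
+ #       if isinstance(module, TensorQuantizer):
+ #           module.disable_quant(); module.enable_calib()
+ #   # ...run representative batches through the model...
+ #   for module in model.modules():
+ #       if isinstance(module, TensorQuantizer):
+ #           module.load_calib_amax(); module.enable_quant(); module.disable_calib()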
+
+
+class QDQBertSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ # Quantize Linear layer
+ self.dense = quant_nn.QuantLinear(config.hidden_size, config.hidden_size)
+
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # Quantize the inputs to the residual add
+ self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+ self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ # Quantize the inputs to the residual add
+ add_local = self.add_local_input_quantizer(hidden_states)
+ add_residual = self.add_residual_input_quantizer(input_tensor)
+ hidden_states = self.LayerNorm(add_local + add_residual)
+ return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertAttention with Bert -> QDQBert
+class QDQBertAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.self = QDQBertSelfAttention(config)
+ self.output = QDQBertSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ output_attentions=False,
+ ):
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_values,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class QDQBertIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ # Quantize Linear layer
+ self.dense = quant_nn.QuantLinear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class QDQBertOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ # Quantize Linear layer
+ self.dense = quant_nn.QuantLinear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ # Quantize the inputs to the residual add
+ self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+ self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+
+ def forward(self, hidden_states, input_tensor):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ # Quantize the inputs to the residual add
+ add_local = self.add_local_input_quantizer(hidden_states)
+ add_residual = self.add_residual_input_quantizer(input_tensor)
+ hidden_states = self.LayerNorm(add_local + add_residual)
+ return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertLayer with Bert -> QDQBert
+class QDQBertLayer(GradientCheckpointingLayer):
+ def __init__(self, config):
+ super().__init__()
+ self.seq_len_dim = 1
+ self.attention = QDQBertAttention(config)
+ self.is_decoder = config.is_decoder
+ self.add_cross_attention = config.add_cross_attention
+ if self.add_cross_attention:
+ if not self.is_decoder:
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+ self.crossattention = QDQBertAttention(config)
+ self.intermediate = QDQBertIntermediate(config)
+ self.output = QDQBertOutput(config)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ output_attentions=False,
+ ):
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_values=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+
+ # if decoder, the last output is tuple of self-attn cache
+ if self.is_decoder:
+ outputs = self_attention_outputs[1:-1]
+ present_key_value = self_attention_outputs[-1]
+ else:
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ cross_attn_present_key_value = None
+ if self.is_decoder and encoder_hidden_states is not None:
+ if not hasattr(self, "crossattention"):
+ raise ValueError(
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+ " by setting `config.add_cross_attention=True`"
+ )
+
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple
+ cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+ cross_attention_outputs = self.crossattention(
+ attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ cross_attn_past_key_value,
+ output_attentions,
+ )
+ attention_output = cross_attention_outputs[0]
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
+
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
+ cross_attn_present_key_value = cross_attention_outputs[-1]
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ layer_output = self.feed_forward_chunk(attention_output)
+ outputs = (layer_output,) + outputs
+
+ # if decoder, return the attn key/values as the last output
+ if self.is_decoder:
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
+
+
+# Based on transformers.models.bert.modeling_bert.BertEncoder with Bert -> QDQBert
+class QDQBertEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([QDQBertLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+ next_decoder_cache = () if use_cache else None
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_values[i] if past_key_values is not None else None,
+ output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ if self.config.add_cross_attention:
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+class QDQBertPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
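+
+ # Pooling sketch: hidden_states (batch, seq, hidden) -> token 0 -> (batch, hidden)
+ # -> dense + tanh -> (batch, hidden); downstream heads classify on this vector.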
+
+
+class QDQBertPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert -> QDQBert
+class QDQBertLMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = QDQBertPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def _tie_weights(self):
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert -> QDQBert
+class QDQBertOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = QDQBertLMPredictionHead(config)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class QDQBertOnlyNSPHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, pooled_output):
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return seq_relationship_score
+
+
+# Based on transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert -> QDQBert
+class QDQBertPreTrainingHeads(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = QDQBertLMPredictionHead(config)
+ self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+ def forward(self, sequence_output, pooled_output):
+ prediction_scores = self.predictions(sequence_output)
+ seq_relationship_score = self.seq_relationship(pooled_output)
+ return prediction_scores, seq_relationship_score
+
+
+# Based on transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert -> QDQBert
+class QDQBertPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: QDQBertConfig
+ load_tf_weights = load_tf_weights_in_qdqbert
+ base_model_prefix = "bert"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+QDQBERT_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`QDQBertConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+QDQBERT_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare QDQBERT Model transformer outputting raw hidden-states without any specific head on top.",
+ QDQBERT_START_DOCSTRING,
+)
+class QDQBertModel(QDQBertPreTrainedModel):
+ """
+
+ The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+ cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+ all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+ Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+ To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+ to `True`. To be used in a Seq2Seq model, the model needs to be initialized with both `is_decoder` argument and
+ `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+ """
+
+ def __init__(self, config, add_pooling_layer: bool = True):
+ requires_backends(self, "pytorch_quantization")
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = QDQBertEmbeddings(config)
+ self.encoder = QDQBertEncoder(config)
+
+ self.pooler = QDQBertPooler(config) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune: dict[int, list[int]]):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}. See the
+ base class `PreTrainedModel`.
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.config.is_decoder:
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ else:
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ batch_size, seq_length = input_shape
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ batch_size, seq_length = input_shape
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ # past_key_values_length
+ past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+ if attention_mask is None:
+ attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
+
+ if token_type_ids is None:
+ if hasattr(self.embeddings, "token_type_ids"):
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if self.config.is_decoder and encoder_hidden_states is not None:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+ if encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicates we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
+
+
+@add_start_docstrings(
+ """QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING
+)
+class QDQBertLMHeadModel(QDQBertPreTrainedModel):
+ _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ if not config.is_decoder:
+ logger.warning("If you want to use `QDQBertLMHeadModel` as a standalone, add `is_decoder=True.`")
+
+ self.bert = QDQBertModel(config, add_pooling_layer=False)
+ self.cls = QDQBertOnlyMLMHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+ self.cls.predictions.bias = new_embeddings.bias
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[tuple[tuple[torch.LongTensor]]] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, QDQBertLMHeadModel, QDQBertConfig
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> config = QDQBertConfig.from_pretrained("google-bert/bert-base-cased")
+ >>> config.is_decoder = True
+ >>> model = QDQBertLMHeadModel.from_pretrained("google-bert/bert-base-cased", config=config)
+
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> prediction_logits = outputs.logits
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ if labels is not None:
+ use_cache = False
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ prediction_scores = self.cls(sequence_output)
+
+ lm_loss = None
+ if labels is not None:
+ # we are doing next-token prediction; shift prediction scores and input ids by one
+ shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+ labels = labels[:, 1:].contiguous()
+ loss_fct = CrossEntropyLoss()
+ lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((lm_loss,) + output) if lm_loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=lm_loss,
+ logits=prediction_scores,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids: Optional[torch.LongTensor],
+ past_key_values=None,
+ attention_mask: Optional[torch.Tensor] = None,
+ **model_kwargs,
+ ):
+ input_shape = input_ids.shape
+ # if the model is used as a decoder in an encoder-decoder model, the decoder attention mask is created on the fly
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_shape)
+
+ # cut decoder_input_ids if past_key_values is used
+ if past_key_values is not None:
+ past_length = past_key_values.get_seq_length()
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+
+ return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
+
+ def _reorder_cache(self, past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+@add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING)
+class QDQBertForMaskedLM(QDQBertPreTrainedModel):
+ _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ if config.is_decoder:
+ logger.warning(
+ "If you want to use `QDQBertForMaskedLM` make sure `config.is_decoder=False` for "
+ "bi-directional self-attention."
+ )
+
+ self.bert = QDQBertModel(config, add_pooling_layer=False)
+ self.cls = QDQBertOnlyMLMHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+ self.cls.predictions.bias = new_embeddings.bias
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, MaskedLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked); the
+ loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
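+
+ Example (a minimal sketch mirroring the other examples in this file; the checkpoint name is illustrative):
+
+ ```python
+ >>> from transformers import AutoTokenizer, QDQBertForMaskedLM
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> model = QDQBertForMaskedLM.from_pretrained("google-bert/bert-base-cased")
+
+ >>> inputs = tokenizer("The capital of France is [MASK].", return_tensors="pt")
+ >>> logits = model(**inputs).logits
+ >>> mask_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
+ >>> predicted_token = tokenizer.decode(logits[0, mask_index].argmax(dim=-1))
+ ```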
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ prediction_scores = self.cls(sequence_output)
+
+ masked_lm_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss() # -100 index = padding token
+ masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (prediction_scores,) + outputs[2:]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids: torch.LongTensor, attention_mask: Optional[torch.FloatTensor] = None, **model_kwargs
+ ):
+ input_shape = input_ids.shape
+ effective_batch_size = input_shape[0]
+
+ # add a dummy token
+ if self.config.pad_token_id is None:
+ raise ValueError("The PAD token should be defined for generation")
+
+ # attention_mask may be None when this is called directly; default to attending to all input tokens
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_shape)
+ attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+ dummy_token = torch.full(
+ (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+ )
+ input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+ return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+@add_start_docstrings(
+ """Bert Model with a `next sentence prediction (classification)` head on top.""",
+ QDQBERT_START_DOCSTRING,
+)
+class QDQBertForNextSentencePrediction(QDQBertPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.bert = QDQBertModel(config)
+ self.cls = QDQBertOnlyNSPHead(config)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ **kwargs,
+ ) -> Union[tuple, NextSentencePredictorOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
+ (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+ - 0 indicates sequence B is a continuation of sequence A,
+ - 1 indicates sequence B is a random sequence.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, QDQBertForNextSentencePrediction
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+ >>> model = QDQBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
+
+ >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+ >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+ >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+
+ >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+ >>> logits = outputs.logits
+ >>> assert logits[0, 0] < logits[0, 1] # next sentence was random
+ ```"""
+
+ if "next_sentence_label" in kwargs:
+ warnings.warn(
+ "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
+ " `labels` instead.",
+ FutureWarning,
+ )
+ labels = kwargs.pop("next_sentence_label")
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ seq_relationship_scores = self.cls(pooled_output)
+
+ next_sentence_loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+ if not return_dict:
+ output = (seq_relationship_scores,) + outputs[2:]
+ return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
+
+ return NextSentencePredictorOutput(
+ loss=next_sentence_loss,
+ logits=seq_relationship_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ QDQBERT Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+ output) e.g. for GLUE tasks.
+ """,
+ QDQBERT_START_DOCSTRING,
+)
+class QDQBertForSequenceClassification(QDQBertPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.config = config
+
+ self.bert = QDQBertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=SequenceClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss); if
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
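+
+ Example (a minimal sketch; the checkpoint name is illustrative, and `num_labels` is set for single-label
+ classification):
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, QDQBertForSequenceClassification
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> model = QDQBertForSequenceClassification.from_pretrained("google-bert/bert-base-cased", num_labels=2)
+
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs, labels=torch.tensor([1]))
+ >>> loss, logits = outputs.loss, outputs.logits
+ ```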
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(logits, labels)
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ QDQBERT Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+ softmax) e.g. for RocStories/SWAG tasks.
+ """,
+ QDQBERT_START_DOCSTRING,
+)
+class QDQBertForMultipleChoice(QDQBertPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.bert = QDQBertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, 1)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=MultipleChoiceModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, MultipleChoiceModelOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+ num_choices - 1]` where `num_choices` is the size of the second dimension of the input tensors (see
+ `input_ids` above).
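+
+ Example (a minimal sketch of how choices are laid out along the second dimension; the checkpoint name is
+ illustrative):
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, QDQBertForMultipleChoice
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> model = QDQBertForMultipleChoice.from_pretrained("google-bert/bert-base-cased")
+
+ >>> prompt = "France has a bread law, Le Décret Pain, with strict rules on what is allowed in a traditional baguette."
+ >>> choices = ["It is eaten with a fork and a knife.", "It is eaten while held in the hand."]
+ >>> encoding = tokenizer([prompt, prompt], choices, return_tensors="pt", padding=True)
+
+ >>> # batch size 1, num_choices 2
+ >>> outputs = model(**{k: v.unsqueeze(0) for k, v in encoding.items()}, labels=torch.tensor(0).unsqueeze(0))
+ >>> loss, logits = outputs.loss, outputs.logits
+ ```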
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+ input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+ attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+ token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+ position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+ inputs_embeds = (
+ inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+ if inputs_embeds is not None
+ else None
+ )
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ pooled_output = outputs[1]
+
+ pooled_output = self.dropout(pooled_output)
+ logits = self.classifier(pooled_output)
+ reshaped_logits = logits.view(-1, num_choices)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(reshaped_logits, labels)
+
+ if not return_dict:
+ output = (reshaped_logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return MultipleChoiceModelOutput(
+ loss=loss,
+ logits=reshaped_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ QDQBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+ Named-Entity-Recognition (NER) tasks.
+ """,
+ QDQBERT_START_DOCSTRING,
+)
+class QDQBertForTokenClassification(QDQBertPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.bert = QDQBertModel(config, add_pooling_layer=False)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TokenClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, TokenClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
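+
+ Example (a minimal sketch; the checkpoint name is illustrative):
+
+ ```python
+ >>> from transformers import AutoTokenizer, QDQBertForTokenClassification
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> model = QDQBertForTokenClassification.from_pretrained("google-bert/bert-base-cased")
+
+ >>> inputs = tokenizer("HuggingFace is based in NYC", return_tensors="pt")
+ >>> logits = model(**inputs).logits  # shape (batch_size, sequence_length, num_labels)
+ >>> predicted_classes = logits.argmax(dim=-1)
+ ```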
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ sequence_output = self.dropout(sequence_output)
+ logits = self.classifier(sequence_output)
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TokenClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ QDQBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+ layer on top of the hidden-states output to compute `span start logits` and `span end logits`).
+ """,
+ QDQBERT_START_DOCSTRING,
+)
+class QDQBertForQuestionAnswering(QDQBertPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.bert = QDQBertModel(config, add_pooling_layer=False)
+ self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=QuestionAnsweringModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ start_positions: Optional[torch.LongTensor] = None,
+ end_positions: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, QuestionAnsweringModelOutput]:
+ r"""
+ start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for the position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for the position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the sequence
+ are not taken into account for computing the loss.
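+
+ Example (a minimal sketch; the checkpoint name is illustrative):
+
+ ```python
+ >>> from transformers import AutoTokenizer, QDQBertForQuestionAnswering
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+ >>> model = QDQBertForQuestionAnswering.from_pretrained("google-bert/bert-base-cased")
+
+ >>> question, context = "Who was Jim Henson?", "Jim Henson was a nice puppet"
+ >>> inputs = tokenizer(question, context, return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> start = outputs.start_logits.argmax()
+ >>> end = outputs.end_logits.argmax()
+ >>> answer = tokenizer.decode(inputs.input_ids[0, start : end + 1])
+ ```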
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.bert(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.qa_outputs(sequence_output)
+ start_logits, end_logits = logits.split(1, dim=-1)
+ start_logits = start_logits.squeeze(-1).contiguous()
+ end_logits = end_logits.squeeze(-1).contiguous()
+
+ total_loss = None
+ if start_positions is not None and end_positions is not None:
+ # If we are on multi-GPU, the split adds a dimension; squeeze it
+ if len(start_positions.size()) > 1:
+ start_positions = start_positions.squeeze(-1)
+ if len(end_positions.size()) > 1:
+ end_positions = end_positions.squeeze(-1)
+ # sometimes the start/end positions are outside our model inputs; we ignore these terms
+ ignored_index = start_logits.size(1)
+ start_positions = start_positions.clamp(0, ignored_index)
+ end_positions = end_positions.clamp(0, ignored_index)
+
+ loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+ start_loss = loss_fct(start_logits, start_positions)
+ end_loss = loss_fct(end_logits, end_positions)
+ total_loss = (start_loss + end_loss) / 2
+
+ if not return_dict:
+ output = (start_logits, end_logits) + outputs[2:]
+ return ((total_loss,) + output) if total_loss is not None else output
+
+ return QuestionAnsweringModelOutput(
+ loss=total_loss,
+ start_logits=start_logits,
+ end_logits=end_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = [
+ "QDQBertForMaskedLM",
+ "QDQBertForMultipleChoice",
+ "QDQBertForNextSentencePrediction",
+ "QDQBertForQuestionAnswering",
+ "QDQBertForSequenceClassification",
+ "QDQBertForTokenClassification",
+ "QDQBertLayer",
+ "QDQBertLMHeadModel",
+ "QDQBertModel",
+ "QDQBertPreTrainedModel",
+ "load_tf_weights_in_qdqbert",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdfdeb5d179c8e954c9c6822d3f154d632394964
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_realm import *
+ from .modeling_realm import *
+ from .retrieval_realm import *
+ from .tokenization_realm import *
+ from .tokenization_realm_fast import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0173420a74aedd5e4178a49dc8ea60b8e87948c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e49cd34401f2ac7d5bbb735c1414af7ea75a6578
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/configuration_realm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21647161e9eb4a833909bd71dbd74a25b2c55908
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/modeling_realm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e0b6ad89b8e1cf9bea00e0c16bd906f7125838d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9adb0e6e24b11ea73ec1ac3191495d21c10594b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36e431bc359ddba95267aa8aa35694b5bce2f675
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/__pycache__/tokenization_realm_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/configuration_realm.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/configuration_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbf32378a604beca939ff6e0241f69f4c4382120
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/configuration_realm.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""REALM model configuration."""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class RealmConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of
+
+ 1. [`RealmEmbedder`]
+ 2. [`RealmScorer`]
+ 3. [`RealmKnowledgeAugEncoder`]
+ 4. [`RealmRetriever`]
+ 5. [`RealmReader`]
+ 6. [`RealmForOpenQA`]
+
+ It is used to instantiate a REALM model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM
+ [google/realm-cc-news-pretrained-embedder](https://huggingface.co/google/realm-cc-news-pretrained-embedder)
+ architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the
+ `inputs_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or
+ [`RealmReader`].
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimension of the encoder layers and the pooler layer.
+ retriever_proj_size (`int`, *optional*, defaults to 128):
+ Dimension of the retriever (embedder) projection.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ num_candidates (`int`, *optional*, defaults to 8):
+ Number of candidates input to the RealmScorer or RealmKnowledgeAugEncoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`],
+ [`RealmKnowledgeAugEncoder`], or [`RealmReader`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ span_hidden_size (`int`, *optional*, defaults to 256):
+ Dimension of the reader's spans.
+ max_span_width (`int`, *optional*, defaults to 10):
+ Max span width of the reader.
+ reader_layer_norm_eps (`float`, *optional*, defaults to 1e-3):
+ The epsilon used by the reader's layer normalization layers.
+ reader_beam_size (`int`, *optional*, defaults to 5):
+ Beam size of the reader.
+ reader_seq_len (`int`, *optional*, defaults to 320):
+ Maximum sequence length of the reader (288 + 32).
+ num_block_records (`int`, *optional*, defaults to 13353718):
+ Number of block records.
+ searcher_beam_size (`int`, *optional*, defaults to 5000):
+ Beam size of the searcher. Note that when eval mode is enabled, *searcher_beam_size* will be the same as
+ *reader_beam_size*.
+
+ Example:
+
+ ```python
+ >>> from transformers import RealmConfig, RealmEmbedder
+
+ >>> # Initializing a REALM realm-cc-news-pretrained-* style configuration
+ >>> configuration = RealmConfig()
+
+ >>> # Initializing a model (with random weights) from the google/realm-cc-news-pretrained-embedder style configuration
+ >>> model = RealmEmbedder(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "realm"
+
+ def __init__(
+ self,
+ vocab_size=30522,
+ hidden_size=768,
+ retriever_proj_size=128,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ num_candidates=8,
+ intermediate_size=3072,
+ hidden_act="gelu_new",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=2,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ span_hidden_size=256,
+ max_span_width=10,
+ reader_layer_norm_eps=1e-3,
+ reader_beam_size=5,
+ reader_seq_len=320, # 288 + 32
+ num_block_records=13353718,
+ searcher_beam_size=5000,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+ # Common config
+ self.vocab_size = vocab_size
+ self.max_position_embeddings = max_position_embeddings
+ self.hidden_size = hidden_size
+ self.retriever_proj_size = retriever_proj_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.num_candidates = num_candidates
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.type_vocab_size = type_vocab_size
+ self.layer_norm_eps = layer_norm_eps
+
+ # Reader config
+ self.span_hidden_size = span_hidden_size
+ self.max_span_width = max_span_width
+ self.reader_layer_norm_eps = reader_layer_norm_eps
+ self.reader_beam_size = reader_beam_size
+ self.reader_seq_len = reader_seq_len
+
+ # Retrieval config
+ self.num_block_records = num_block_records
+ self.searcher_beam_size = searcher_beam_size
+
+
+__all__ = ["RealmConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/modeling_realm.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/modeling_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e2de5c9c1c4491390a0a31ce9c0613e9500096b
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/modeling_realm.py
@@ -0,0 +1,1855 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch REALM model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import (
+ BaseModelOutputWithPastAndCrossAttentions,
+ BaseModelOutputWithPoolingAndCrossAttentions,
+ MaskedLMOutput,
+ ModelOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_realm import RealmConfig
+
+
+logger = logging.get_logger(__name__)
+_EMBEDDER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-embedder"
+_ENCODER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-encoder"
+_SCORER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-scorer"
+_CONFIG_FOR_DOC = "RealmConfig"
+
+
+def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ if isinstance(model, RealmReader) and "reader" not in name:
+ logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter")
+ continue
+
+ # For pretrained openqa reader
+ if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmForOpenQA):
+ name = name.replace("bert/", "reader/realm/")
+ name = name.replace("cls/", "reader/cls/")
+
+ # For pretrained encoder
+ if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmKnowledgeAugEncoder):
+ name = name.replace("bert/", "realm/")
+
+ # For finetuned reader
+ if name.startswith("reader"):
+ reader_prefix = "" if isinstance(model, RealmReader) else "reader/"
+ name = name.replace("reader/module/bert/", f"{reader_prefix}realm/")
+ name = name.replace("reader/module/cls/", f"{reader_prefix}cls/")
+ name = name.replace("reader/dense/", f"{reader_prefix}qa_outputs/dense_intermediate/")
+ name = name.replace("reader/dense_1/", f"{reader_prefix}qa_outputs/dense_output/")
+ name = name.replace("reader/layer_normalization", f"{reader_prefix}qa_outputs/layer_normalization")
+
+ # For embedder and scorer
+ if name.startswith("module/module/module/"): # finetuned
+ embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/"
+ name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}realm/")
+ name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/")
+ name = name.replace("module/module/module/dense/", f"{embedder_prefix}cls/dense/")
+ name = name.replace("module/module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/")
+ name = name.replace("module/module/module/bert/", f"{embedder_prefix}realm/")
+ name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/")
+ elif name.startswith("module/module/"): # pretrained
+ embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/"
+ name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/")
+ name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/")
+
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+ # which are not required for using the pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ assert pointer.shape == array.shape, (
+ f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+ )
+ except AssertionError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
+class RealmEmbeddings(nn.Module):
+ """Construct the embeddings from word, position and token_type embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+ self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+ # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+ # any TensorFlow checkpoint file
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+ self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+ self.register_buffer(
+ "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+ )
+ self.register_buffer(
+ "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+ )
+
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ past_key_values_length: int = 0,
+ ) -> torch.Tensor:
+ if input_ids is not None:
+ input_shape = input_ids.size()
+ else:
+ input_shape = inputs_embeds.size()[:-1]
+
+ seq_length = input_shape[1]
+
+ if position_ids is None:
+ position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+ # Set token_type_ids to the registered buffer from the constructor, where it is all zeros. This usually happens
+ # when token_type_ids is auto-generated; the registered buffer helps users trace the model without passing
+ # token_type_ids. Solves issue #5664.
+ if token_type_ids is None:
+ if hasattr(self, "token_type_ids"):
+ buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+ if inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+ token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+ embeddings = inputs_embeds + token_type_embeddings
+ if self.position_embedding_type == "absolute":
+ position_embeddings = self.position_embeddings(position_ids)
+ embeddings += position_embeddings
+ embeddings = self.LayerNorm(embeddings)
+ embeddings = self.dropout(embeddings)
+ return embeddings
+
+
+class RealmSelfAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+ f"heads ({config.num_attention_heads})"
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+ self.position_embedding_type = position_embedding_type or getattr(
+ config, "position_embedding_type", "absolute"
+ )
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ self.max_position_embeddings = config.max_position_embeddings
+ self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+ self.is_decoder = config.is_decoder
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
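+ # (batch, seq_len, all_head_size) -> (batch, num_heads, seq_len, head_size)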
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ mixed_query_layer = self.query(hidden_states)
+
+ # If this is instantiated as a cross-attention module, the keys
+ # and values come from an encoder; the attention mask needs to be
+ # such that the encoder's padding tokens are not attended to.
+ is_cross_attention = encoder_hidden_states is not None
+
+ if is_cross_attention and past_key_values is not None:
+ # reuse k,v, cross_attentions
+ key_layer = past_key_values[0]
+ value_layer = past_key_values[1]
+ attention_mask = encoder_attention_mask
+ elif is_cross_attention:
+ key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+ value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+ attention_mask = encoder_attention_mask
+ elif past_key_values is not None:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+ value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+ else:
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ use_cache = past_key_values is not None
+ if self.is_decoder:
+ # If this is cross-attention, save the tuple (key_layer, value_layer) of all cross-attention
+ # key/value states. Further calls to the cross-attention layer can then reuse them
+ # (first "if" case above).
+ # If this is uni-directional self-attention (decoder), save the tuple of all previous decoder
+ # key/value states. Further calls to uni-directional self-attention can then concat the
+ # previous key/value states to the currently projected ones (third "elif" case above).
+ # If this is encoder bi-directional self-attention, `past_key_values` is always `None`.
+ past_key_values = (key_layer, value_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+ query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+ if use_cache:
+ position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+ -1, 1
+ )
+ else:
+ position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+ position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+ distance = position_ids_l - position_ids_r
+
+ positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+ positional_embedding = positional_embedding.to(dtype=query_layer.dtype) # fp16 compatibility
+
+ if self.position_embedding_type == "relative_key":
+ relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores
+ elif self.position_embedding_type == "relative_key_query":
+ relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+ relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+ attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask (precomputed for all layers in the RealmModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ if self.is_decoder:
+ outputs = outputs + (past_key_values,)
+ return outputs
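+
+ # Scoring sketch: with query and key layers of shape [batch, heads, q_len, head_size] and
+ # [batch, heads, k_len, head_size], the raw scores are a [batch, heads, q_len, k_len] matmul,
+ # optionally augmented with the relative-position einsum terms above, then scaled by
+ # 1/sqrt(attention_head_size) before the additive mask and softmax are applied.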
+
+
+class RealmSelfOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+REALM_SELF_ATTENTION_CLASSES = {
+ "eager": RealmSelfAttention,
+}
+
+
+class RealmAttention(nn.Module):
+ def __init__(self, config, position_embedding_type=None):
+ super().__init__()
+ self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation](
+ config, position_embedding_type=position_embedding_type
+ )
+ self.output = RealmSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.self.query = prune_linear_layer(self.self.query, index)
+ self.self.key = prune_linear_layer(self.self.key, index)
+ self.self.value = prune_linear_layer(self.self.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+ self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ self_outputs = self.self(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_values,
+ output_attentions,
+ )
+ attention_output = self.output(self_outputs[0], hidden_states)
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
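+
+ # Example (hypothetical usage): pruning heads {0, 2} of a layer removes their rows from the
+ # query/key/value projections and the matching columns from the output projection, shrinking
+ # all_head_size by 2 * attention_head_size:
+ #   layer.attention.prune_heads({0, 2})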
+
+
+class RealmIntermediate(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+ return hidden_states
+
+
+class RealmOutput(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states + input_tensor)
+ return hidden_states
+
+
+class RealmLayer(GradientCheckpointingLayer):
+ def __init__(self, config):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = RealmAttention(config)
+ self.is_decoder = config.is_decoder
+ self.add_cross_attention = config.add_cross_attention
+ if self.add_cross_attention:
+ if not self.is_decoder:
+ raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+ self.crossattention = RealmAttention(config, position_embedding_type="absolute")
+ self.intermediate = RealmIntermediate(config)
+ self.output = RealmOutput(config)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ ) -> tuple[torch.Tensor]:
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+ self_attention_outputs = self.attention(
+ hidden_states,
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ past_key_values=self_attn_past_key_value,
+ )
+ attention_output = self_attention_outputs[0]
+
+ # if decoder, the last output is tuple of self-attn cache
+ if self.is_decoder:
+ outputs = self_attention_outputs[1:-1]
+ present_key_value = self_attention_outputs[-1]
+ else:
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ cross_attn_present_key_value = None
+ if self.is_decoder and encoder_hidden_states is not None:
+ if not hasattr(self, "crossattention"):
+ raise ValueError(
+ f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+ " by setting `config.add_cross_attention=True`"
+ )
+
+ # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple
+ cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+ cross_attention_outputs = self.crossattention(
+ attention_output,
+ attention_mask,
+ head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ cross_attn_past_key_value,
+ output_attentions,
+ )
+ attention_output = cross_attention_outputs[0]
+ outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights
+
+ # add cross-attn cache to positions 3,4 of present_key_value tuple
+ cross_attn_present_key_value = cross_attention_outputs[-1]
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ layer_output = apply_chunking_to_forward(
+ self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+ )
+ outputs = (layer_output,) + outputs
+
+ # if decoder, return the attn key/values as the last output
+ if self.is_decoder:
+ outputs = outputs + (present_key_value,)
+
+ return outputs
+
+ def feed_forward_chunk(self, attention_output):
+ intermediate_output = self.intermediate(attention_output)
+ layer_output = self.output(intermediate_output, attention_output)
+ return layer_output
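+
+ # Chunking sketch: apply_chunking_to_forward splits attention_output along the sequence
+ # dimension (seq_len_dim=1) into chunks of chunk_size_feed_forward, runs feed_forward_chunk
+ # on each chunk, and concatenates the results, trading extra calls for lower peak memory.
+ # With chunk_size_feed_forward=0 the whole tensor is processed in a single call.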
+
+
+class RealmEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([RealmLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = False,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ next_decoder_cache = () if use_cache else None
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ encoder_hidden_states,
+ encoder_attention_mask,
+ past_key_values[i] if past_key_values is not None else None,
+ output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+ if use_cache:
+ next_decoder_cache += (layer_outputs[-1],)
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+ if self.config.add_cross_attention:
+ all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ hidden_states,
+ next_decoder_cache,
+ all_hidden_states,
+ all_self_attentions,
+ all_cross_attentions,
+ ]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_decoder_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+class RealmPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+@dataclass
+class RealmEmbedderOutput(ModelOutput):
+ """
+ Outputs of [`RealmEmbedder`] models.
+
+ Args:
+ projected_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`):
+ Projected score.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attention weights after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ """
+
+ projected_score: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class RealmScorerOutput(ModelOutput):
+ """
+ Outputs of [`RealmScorer`] models.
+
+ Args:
+ relevance_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates)`):
+ The relevance score of document candidates (before softmax).
+ query_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`):
+ Query score derived from the query embedder.
+ candidate_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates, config.retriever_proj_size)`):
+ Candidate score derived from the embedder.
+ """
+
+ relevance_score: Optional[torch.FloatTensor] = None
+ query_score: Optional[torch.FloatTensor] = None
+ candidate_score: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class RealmReaderOutput(ModelOutput):
+ """
+ Outputs of [`RealmReader`] models.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
+ Total loss.
+ retriever_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
+ Retriever loss.
+ reader_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
+ Reader loss.
+ retriever_correct (`torch.BoolTensor` of shape `(config.searcher_beam_size,)`, *optional*):
+ Whether or not an evidence block contains the answer.
+ reader_correct (`torch.BoolTensor` of shape `(config.reader_beam_size, num_candidates)`, *optional*):
+ Whether or not a span candidate contains the answer.
+ block_idx (`torch.LongTensor` of shape `()`):
+ The index of the retrieved evidence block in which the predicted answer is most likely.
+ candidate (`torch.LongTensor` of shape `()`):
+ The index of the retrieved span candidates in which the predicted answer is most likely.
+ start_pos (`torch.IntTensor` of shape `()`):
+ Predicted answer starting position in *RealmReader*'s inputs.
+ end_pos (`torch.IntTensor` of shape `()`):
+ Predicted answer ending position in *RealmReader*'s inputs.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attention weights after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ retriever_loss: Optional[torch.FloatTensor] = None
+ reader_loss: Optional[torch.FloatTensor] = None
+ retriever_correct: Optional[torch.BoolTensor] = None
+ reader_correct: Optional[torch.BoolTensor] = None
+ block_idx: Optional[torch.LongTensor] = None
+ candidate: Optional[torch.LongTensor] = None
+ start_pos: Optional[torch.IntTensor] = None
+ end_pos: Optional[torch.IntTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class RealmForOpenQAOutput(ModelOutput):
+ """
+
+ Outputs of [`RealmForOpenQA`] models.
+
+ Args:
+ reader_output (`dict`):
+ Reader output.
+ predicted_answer_ids (`torch.LongTensor` of shape `(answer_sequence_length)`):
+ Predicted answer ids.
+ """
+
+ reader_output: Optional[dict] = None
+ predicted_answer_ids: Optional[torch.LongTensor] = None
+
+
+class RealmPredictionHeadTransform(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ if isinstance(config.hidden_act, str):
+ self.transform_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.transform_act_fn = config.hidden_act
+ self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.transform_act_fn(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class RealmLMPredictionHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.transform = RealmPredictionHeadTransform(config)
+
+ # The output weights are the same as the input embeddings, but there is
+ # an output-only bias for each token.
+ self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+ # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+ self.decoder.bias = self.bias
+
+ def _tie_weights(self):
+ self.decoder.bias = self.bias
+
+ def forward(self, hidden_states):
+ hidden_states = self.transform(hidden_states)
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+class RealmOnlyMLMHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = RealmLMPredictionHead(config)
+
+ def forward(self, sequence_output):
+ prediction_scores = self.predictions(sequence_output)
+ return prediction_scores
+
+
+class RealmScorerProjection(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.predictions = RealmLMPredictionHead(config)
+ self.dense = nn.Linear(config.hidden_size, config.retriever_proj_size)
+ self.LayerNorm = nn.LayerNorm(config.retriever_proj_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states):
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.LayerNorm(hidden_states)
+ return hidden_states
+
+
+class RealmReaderProjection(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.dense_intermediate = nn.Linear(config.hidden_size, config.span_hidden_size * 2)
+ self.dense_output = nn.Linear(config.span_hidden_size, 1)
+ self.layer_normalization = nn.LayerNorm(config.span_hidden_size, eps=config.reader_layer_norm_eps)
+ self.relu = nn.ReLU()
+
+ def forward(self, hidden_states, block_mask):
+ def span_candidates(masks):
+ """
+ Generate span candidates.
+
+ Args:
+ masks: [num_retrievals, max_sequence_len]
+
+ Returns:
+ starts: [num_spans]
+ ends: [num_spans]
+ span_masks: [num_retrievals, num_spans], whether each span lies inside the evidence block.
+ """
+ _, max_sequence_len = masks.shape
+
+ def _spans_given_width(width):
+ current_starts = torch.arange(max_sequence_len - width + 1, device=masks.device)
+ current_ends = torch.arange(width - 1, max_sequence_len, device=masks.device)
+ return current_starts, current_ends
+
+ starts, ends = zip(*(_spans_given_width(w + 1) for w in range(self.config.max_span_width)))
+
+ # [num_spans]
+ starts = torch.cat(starts, 0)
+ ends = torch.cat(ends, 0)
+
+ # [num_retrievals, num_spans]
+ start_masks = torch.index_select(masks, dim=-1, index=starts)
+ end_masks = torch.index_select(masks, dim=-1, index=ends)
+ span_masks = start_masks * end_masks
+
+ return starts, ends, span_masks
+
+ def mask_to_score(mask, dtype=torch.float32):
+ return (1.0 - mask.type(dtype)) * torch.finfo(dtype).min
+
+ # [reader_beam_size, max_sequence_len, span_hidden_size * 2]
+ hidden_states = self.dense_intermediate(hidden_states)
+ # [reader_beam_size, max_sequence_len, span_hidden_size]
+ start_projection, end_projection = hidden_states.chunk(2, dim=-1)
+
+ candidate_starts, candidate_ends, candidate_mask = span_candidates(block_mask)
+
+ candidate_start_projections = torch.index_select(start_projection, dim=1, index=candidate_starts)
+ candidate_end_projections = torch.index_select(end_projection, dim=1, index=candidate_ends)
+ candidate_hidden = candidate_start_projections + candidate_end_projections
+
+ # [reader_beam_size, num_candidates, span_hidden_size]
+ candidate_hidden = self.relu(candidate_hidden)
+ # [reader_beam_size, num_candidates, span_hidden_size]
+ candidate_hidden = self.layer_normalization(candidate_hidden)
+ # [reader_beam_size, num_candidates]
+ reader_logits = self.dense_output(candidate_hidden).squeeze(-1)
+ # [reader_beam_size, num_candidates]
+ reader_logits += mask_to_score(candidate_mask, dtype=reader_logits.dtype)
+
+ return reader_logits, candidate_starts, candidate_ends
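+
+ # Worked example (hypothetical sizes): with max_sequence_len=4 and max_span_width=2,
+ # _spans_given_width yields starts [0, 1, 2, 3] / ends [0, 1, 2, 3] for width 1 and
+ # starts [0, 1, 2] / ends [1, 2, 3] for width 2, so after concatenation
+ # starts = [0, 1, 2, 3, 0, 1, 2] and ends = [0, 1, 2, 3, 1, 2, 3] (num_spans = 7);
+ # span_masks then keeps only spans whose start and end both fall inside the evidence block.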
+
+
+REALM_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`RealmConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+REALM_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+ config.max_position_embeddings - 1]`.
+
+ [What are position IDs?](../glossary#position-ids)
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class RealmPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: RealmConfig
+ load_tf_weights = load_tf_weights_in_realm
+ base_model_prefix = "realm"
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+ def _flatten_inputs(self, *inputs):
+ """Flatten inputs' shape to (-1, input_shape[-1])"""
+ flattened_inputs = []
+ for tensor in inputs:
+ if tensor is None:
+ flattened_inputs.append(None)
+ else:
+ input_shape = tensor.shape
+ if len(input_shape) > 2:
+ tensor = tensor.view((-1, input_shape[-1]))
+ flattened_inputs.append(tensor)
+ return flattened_inputs
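+
+ # Example (hypothetical sizes): a [batch_size=2, num_candidates=3, seq_len=10] tensor of
+ # candidate ids is reshaped to [6, 10] so candidates can be encoded as one flat batch;
+ # None entries and tensors that are already 2D pass through unchanged.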
+
+
+class RealmBertModel(RealmPreTrainedModel):
+ """
+ Same as the original BertModel but with docstrings removed.
+ """
+
+ def __init__(self, config, add_pooling_layer=True):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = RealmEmbeddings(config)
+ self.encoder = RealmEncoder(config)
+
+ self.pooler = RealmPooler(config) if add_pooling_layer else None
+
+ # Weight initialization is mostly managed by the other Realm models,
+ # but we also initialize the weights here for consistency.
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.embeddings.word_embeddings = value
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ token_type_ids=None,
+ position_ids=None,
+ head_mask=None,
+ inputs_embeds=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ past_key_values=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if self.config.is_decoder:
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ else:
+ use_cache = False
+
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+ input_shape = input_ids.size()
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ batch_size, seq_length = input_shape
+ device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+ # past_key_values_length
+ past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+ if attention_mask is None:
+ attention_mask = torch.ones((batch_size, seq_length + past_key_values_length), device=device)
+
+ if token_type_ids is None:
+ if hasattr(self.embeddings, "token_type_ids"):
+ buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+ buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+ token_type_ids = buffered_token_type_ids_expanded
+ else:
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+ # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+ # ourselves in which case we just need to make it broadcastable to all heads.
+ extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+ # If a 2D or 3D attention mask is provided for the cross-attention
+ # we need to make it broadcastable to [batch_size, num_heads, seq_length, seq_length]
+ if self.config.is_decoder and encoder_hidden_states is not None:
+ encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+ encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+ if encoder_attention_mask is None:
+ encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+ encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+ else:
+ encoder_extended_attention_mask = None
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ embedding_output = self.embeddings(
+ input_ids=input_ids,
+ position_ids=position_ids,
+ token_type_ids=token_type_ids,
+ inputs_embeds=inputs_embeds,
+ past_key_values_length=past_key_values_length,
+ )
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ head_mask=head_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_extended_attention_mask,
+ past_key_values=past_key_values,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndCrossAttentions(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ past_key_values=encoder_outputs.past_key_values,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ cross_attentions=encoder_outputs.cross_attentions,
+ )
+
+
+@add_start_docstrings(
+ "The embedder of REALM outputting projected score that will be used to calculate relevance score.",
+ REALM_START_DOCSTRING,
+)
+class RealmEmbedder(RealmPreTrainedModel):
+ _tied_weights_keys = ["cls.predictions.decoder.bias"]
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.realm = RealmBertModel(self.config)
+ self.cls = RealmScorerProjection(self.config)
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.realm.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.realm.embeddings.word_embeddings = value
+
+ @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=RealmEmbedderOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, RealmEmbedderOutput]:
+ r"""
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, RealmEmbedder
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-embedder")
+ >>> model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder")
+
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> projected_score = outputs.projected_score
+ ```
+ """
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ realm_outputs = self.realm(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # [batch_size, hidden_size]
+ pooler_output = realm_outputs[1]
+ # [batch_size, retriever_proj_size]
+ projected_score = self.cls(pooler_output)
+
+ if not return_dict:
+ return (projected_score,) + realm_outputs[2:4]
+ else:
+ return RealmEmbedderOutput(
+ projected_score=projected_score,
+ hidden_states=realm_outputs.hidden_states,
+ attentions=realm_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ "The scorer of REALM outputting relevance scores representing the score of document candidates (before softmax).",
+ REALM_START_DOCSTRING,
+)
+class RealmScorer(RealmPreTrainedModel):
+ r"""
+ Args:
+ query_embedder ([`RealmEmbedder`]):
+ Embedder for input sequences. If not specified, it will use the same embedder as candidate sequences.
+ """
+
+ def __init__(self, config, query_embedder=None):
+ super().__init__(config)
+
+ self.embedder = RealmEmbedder(self.config)
+
+ self.query_embedder = query_embedder if query_embedder is not None else self.embedder
+
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @replace_return_docstrings(output_type=RealmScorerOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ candidate_input_ids: Optional[torch.LongTensor] = None,
+ candidate_attention_mask: Optional[torch.FloatTensor] = None,
+ candidate_token_type_ids: Optional[torch.LongTensor] = None,
+ candidate_inputs_embeds: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, RealmScorerOutput]:
+ r"""
+ candidate_input_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`):
+ Indices of candidate input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ candidate_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ candidate_token_type_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token.
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ candidate_inputs_embeds (`torch.FloatTensor` of shape `(batch_size * num_candidates, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `candidate_input_ids` you can choose to directly pass an embedded
+ representation. This is useful if you want more control over how to convert *candidate_input_ids* indices
+ into associated vectors than the model's internal embedding lookup matrix.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, RealmScorer
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-scorer")
+ >>> model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer", num_candidates=2)
+
+ >>> # batch_size = 2, num_candidates = 2
+ >>> input_texts = ["How are you?", "What is the item in the picture?"]
+ >>> candidates_texts = [["Hello world!", "Nice to meet you!"], ["A cute cat.", "An adorable dog."]]
+
+ >>> inputs = tokenizer(input_texts, return_tensors="pt")
+ >>> candidates_inputs = tokenizer.batch_encode_candidates(candidates_texts, max_length=10, return_tensors="pt")
+
+ >>> outputs = model(
+ ... **inputs,
+ ... candidate_input_ids=candidates_inputs.input_ids,
+ ... candidate_attention_mask=candidates_inputs.attention_mask,
+ ... candidate_token_type_ids=candidates_inputs.token_type_ids,
+ ... )
+ >>> relevance_score = outputs.relevance_score
+ ```"""
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None and inputs_embeds is None:
+ raise ValueError("You have to specify either input_ids or input_embeds.")
+
+ if candidate_input_ids is None and candidate_inputs_embeds is None:
+ raise ValueError("You have to specify either candidate_input_ids or candidate_inputs_embeds.")
+
+ query_outputs = self.query_embedder(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # [batch_size * num_candidates, candidate_seq_len]
+ (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs(
+ candidate_input_ids, candidate_attention_mask, candidate_token_type_ids
+ )
+
+ candidate_outputs = self.embedder(
+ flattened_input_ids,
+ attention_mask=flattened_attention_mask,
+ token_type_ids=flattened_token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=candidate_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # [batch_size, retriever_proj_size]
+ query_score = query_outputs[0]
+ # [batch_size * num_candidates, retriever_proj_size]
+ candidate_score = candidate_outputs[0]
+ # [batch_size, num_candidates, retriever_proj_size]
+ candidate_score = candidate_score.view(-1, self.config.num_candidates, self.config.retriever_proj_size)
+ # [batch_size, num_candidates]
+ relevance_score = torch.einsum("bd,bnd->bn", query_score, candidate_score)
+
+ if not return_dict:
+ return relevance_score, query_score, candidate_score
+
+ return RealmScorerOutput(
+ relevance_score=relevance_score, query_score=query_score, candidate_score=candidate_score
+ )
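+
+ # Relevance-score sketch: the einsum "bd,bnd->bn" above is a batched dot product between the
+ # [batch_size, retriever_proj_size] query embedding and each of the num_candidates candidate
+ # embeddings, giving one unnormalized score per (query, candidate) pair.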
+
+
+@add_start_docstrings(
+ "The knowledge-augmented encoder of REALM outputting masked language model logits and marginal log-likelihood"
+ " loss.",
+ REALM_START_DOCSTRING,
+)
+class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
+ _tied_weights_keys = ["cls.predictions.decoder"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.realm = RealmBertModel(self.config)
+ self.cls = RealmOnlyMLMHead(self.config)
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.realm.embeddings.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.realm.embeddings.word_embeddings = value
+
+ def get_output_embeddings(self):
+ return self.cls.predictions.decoder
+
+ def set_output_embeddings(self, new_embeddings):
+ self.cls.predictions.decoder = new_embeddings
+ self.cls.predictions.bias = new_embeddings.bias
+
+ @add_start_docstrings_to_model_forward(
+ REALM_INPUTS_DOCSTRING.format("batch_size, num_candidates, sequence_length")
+ )
+ @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ relevance_score: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ mlm_mask: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, MaskedLMOutput]:
+ r"""
+ relevance_score (`torch.FloatTensor` of shape `(batch_size, num_candidates)`, *optional*):
+ Relevance score derived from RealmScorer, must be specified if you want to compute the masked language
+ modeling loss.
+
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ mlm_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked.
+ Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, RealmKnowledgeAugEncoder
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-encoder")
+ >>> model = RealmKnowledgeAugEncoder.from_pretrained(
+ ... "google/realm-cc-news-pretrained-encoder", num_candidates=2
+ ... )
+
+ >>> # batch_size = 2, num_candidates = 2
+ >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
+
+ >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+ >>> outputs = model(**inputs)
+ >>> logits = outputs.logits
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if labels is not None and relevance_score is None:
+ raise ValueError(
+ "You have to specify `relevance_score` when `labels` is specified in order to compute loss."
+ )
+
+ (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs(
+ input_ids, attention_mask, token_type_ids
+ )
+
+ joint_outputs = self.realm(
+ flattened_input_ids,
+ attention_mask=flattened_attention_mask,
+ token_type_ids=flattened_token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # [batch_size * num_candidates, joint_seq_len, hidden_size]
+ joint_output = joint_outputs[0]
+ # [batch_size * num_candidates, joint_seq_len, vocab_size]
+ prediction_scores = self.cls(joint_output)
+ # [batch_size, num_candidates]
+ candidate_score = relevance_score
+
+ masked_lm_loss = None
+ if labels is not None:
+ batch_size, seq_length = labels.size()
+
+ if mlm_mask is None:
+ mlm_mask = torch.ones_like(labels, dtype=torch.float32)
+ else:
+ mlm_mask = mlm_mask.type(torch.float32)
+
+ # Compute marginal log-likelihood
+ loss_fct = CrossEntropyLoss(reduction="none") # -100 index = padding token
+
+ # [batch_size * num_candidates * joint_seq_len, vocab_size]
+ mlm_logits = prediction_scores.view(-1, self.config.vocab_size)
+ # [batch_size * num_candidates * joint_seq_len]
+ mlm_targets = labels.tile(1, self.config.num_candidates).view(-1)
+ # [batch_size, num_candidates, joint_seq_len]
+ masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view(
+ batch_size, self.config.num_candidates, seq_length
+ )
+ # [batch_size, num_candidates, 1]
+ candidate_log_prob = candidate_score.log_softmax(-1).unsqueeze(-1)
+ # [batch_size, num_candidates, joint_seq_len]
+ joint_gold_log_prob = candidate_log_prob + masked_lm_log_prob
+ # [batch_size, joint_seq_len]
+ marginal_gold_log_probs = joint_gold_log_prob.logsumexp(1)
+ # []
+ masked_lm_loss = -torch.nansum(torch.sum(marginal_gold_log_probs * mlm_mask) / torch.sum(mlm_mask))
+
+ if not return_dict:
+ output = (prediction_scores,) + joint_outputs[2:4]
+ return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+ return MaskedLMOutput(
+ loss=masked_lm_loss,
+ logits=prediction_scores,
+ hidden_states=joint_outputs.hidden_states,
+ attentions=joint_outputs.attentions,
+ )
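+
+ # Loss sketch: the marginal log-likelihood computed above is
+ #   log p(y|x) = logsumexp_z [ log p(z|x) + log p(y|z,x) ]
+ # where log p(z|x) is candidate_log_prob (the retriever's log-softmax over candidates) and
+ # log p(y|z,x) is masked_lm_log_prob (the per-candidate MLM log-likelihood); mlm_mask then
+ # averages the marginal over the positions that should contribute to the loss.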
+
+
+@add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING)
+class RealmReader(RealmPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+
+ self.realm = RealmBertModel(config)
+ self.cls = RealmOnlyMLMHead(config)
+ self.qa_outputs = RealmReaderProjection(config)
+
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("reader_beam_size, sequence_length"))
+ @replace_return_docstrings(output_type=RealmReaderOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ position_ids: Optional[torch.LongTensor] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ relevance_score: Optional[torch.FloatTensor] = None,
+ block_mask: Optional[torch.BoolTensor] = None,
+ start_positions: Optional[torch.LongTensor] = None,
+ end_positions: Optional[torch.LongTensor] = None,
+ has_answers: Optional[torch.BoolTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, RealmReaderOutput]:
+ r"""
+ relevance_score (`torch.FloatTensor` of shape `(searcher_beam_size,)`, *optional*):
+ Relevance score, which must be specified if you want to compute the logits and marginal log loss.
+ block_mask (`torch.BoolTensor` of shape `(searcher_beam_size, sequence_length)`, *optional*):
+ The mask of the evidence block, which must be specified if you want to compute the logits and marginal log
+ loss.
+ start_positions (`torch.LongTensor` of shape `(searcher_beam_size,)`, *optional*):
+ Labels for position (index) of the start of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+ sequence are not taken into account for computing the loss.
+ end_positions (`torch.LongTensor` of shape `(searcher_beam_size,)`, *optional*):
+ Labels for position (index) of the end of the labelled span for computing the token classification loss.
+ Positions are clamped to the length of the sequence (`sequence_length`). Positions outside of the
+ sequence are not taken into account for computing the loss.
+ has_answers (`torch.BoolTensor` of shape `(searcher_beam_size,)`, *optional*):
+ Whether or not the evidence block has answer(s).
+
+ Returns:
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if relevance_score is None:
+ raise ValueError("You have to specify `relevance_score` to calculate logits and loss.")
+ if block_mask is None:
+ raise ValueError("You have to specify `block_mask` to separate question block and evidence block.")
+ if token_type_ids.size(1) < self.config.max_span_width:
+ raise ValueError("The input sequence length must be greater than or equal to config.max_span_width.")
+ outputs = self.realm(
+ input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # [reader_beam_size, joint_seq_len, hidden_size]
+ sequence_output = outputs[0]
+
+ # [reader_beam_size, num_candidates], [num_candidates], [num_candidates]
+ reader_logits, candidate_starts, candidate_ends = self.qa_outputs(
+ sequence_output, block_mask[0 : self.config.reader_beam_size]
+ )
+ # [reader_beam_size, 1]
+ retriever_logits = torch.unsqueeze(relevance_score[0 : self.config.reader_beam_size], -1)
+ # [reader_beam_size, num_candidates]
+ reader_logits += retriever_logits
+ # []
+ predicted_block_index = torch.argmax(torch.max(reader_logits, dim=1).values)
+ # []
+ predicted_candidate = torch.argmax(torch.max(reader_logits, dim=0).values)
+ # [1]
+ predicted_start = torch.index_select(candidate_starts, dim=0, index=predicted_candidate)
+ # [1]
+ predicted_end = torch.index_select(candidate_ends, dim=0, index=predicted_candidate)
+
+ total_loss = None
+ retriever_loss = None
+ reader_loss = None
+ retriever_correct = None
+ reader_correct = None
+ if start_positions is not None and end_positions is not None and has_answers is not None:
+
+ def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts, gold_ends):
+ """Compute correct span."""
+ # [reader_beam_size, num_answers, num_candidates]
+ is_gold_start = torch.eq(
+ torch.unsqueeze(torch.unsqueeze(candidate_starts, 0), 0), torch.unsqueeze(gold_starts, -1)
+ )
+ is_gold_end = torch.eq(
+ torch.unsqueeze(torch.unsqueeze(candidate_ends, 0), 0), torch.unsqueeze(gold_ends, -1)
+ )
+
+ # [reader_beam_size, num_candidates]
+ return torch.any(torch.logical_and(is_gold_start, is_gold_end), 1)
+
+ def marginal_log_loss(logits, is_correct):
+ """Loss based on the negative marginal log-likelihood."""
+
+ def mask_to_score(mask, dtype=torch.float32):
+ return (1.0 - mask.type(dtype)) * torch.finfo(dtype).min
+
+ # []
+ log_numerator = torch.logsumexp(logits + mask_to_score(is_correct, dtype=logits.dtype), dim=-1)
+ log_denominator = torch.logsumexp(logits, dim=-1)
+ return log_denominator - log_numerator
+
+ # Sometimes the start/end positions are outside our model inputs; we ignore these terms.
+ # `-1` is reserved for no answer.
+ ignored_index = sequence_output.size(1)
+ start_positions = start_positions.clamp(-1, ignored_index)
+ end_positions = end_positions.clamp(-1, ignored_index)
+
+ retriever_correct = has_answers
+ any_retriever_correct = torch.any(retriever_correct)
+
+ reader_correct = compute_correct_candidates(
+ candidate_starts=candidate_starts,
+ candidate_ends=candidate_ends,
+ gold_starts=start_positions[0 : self.config.reader_beam_size],
+ gold_ends=end_positions[0 : self.config.reader_beam_size],
+ )
+ any_reader_correct = torch.any(reader_correct)
+
+ retriever_loss = marginal_log_loss(relevance_score, retriever_correct)
+ reader_loss = marginal_log_loss(reader_logits.view(-1), reader_correct.view(-1))
+ retriever_loss *= any_retriever_correct.type(torch.float32)
+ reader_loss *= any_reader_correct.type(torch.float32)
+
+ total_loss = (retriever_loss + reader_loss).mean()
+
+ if not return_dict:
+ output = (predicted_block_index, predicted_candidate, predicted_start, predicted_end) + outputs[2:]
+ return (
+ ((total_loss, retriever_loss, reader_loss, retriever_correct, reader_correct) + output)
+ if total_loss is not None
+ else output
+ )
+
+ return RealmReaderOutput(
+ loss=total_loss,
+ retriever_loss=retriever_loss,
+ reader_loss=reader_loss,
+ retriever_correct=retriever_correct,
+ reader_correct=reader_correct,
+ block_idx=predicted_block_index,
+ candidate=predicted_candidate,
+ start_pos=predicted_start,
+ end_pos=predicted_end,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+REALM_FOR_OPEN_QA_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+ Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+ 1]`:
+
+ - 0 corresponds to a *sentence A* token,
+ - 1 corresponds to a *sentence B* token (should not be used in this model by design).
+
+ [What are token type IDs?](../glossary#token-type-ids)
+ answer_ids (`list` of shape `(num_answers, answer_length)`, *optional*):
+ Answer ids for computing the marginal log-likelihood loss. Indices should be in `[-1, 0, ...,
+ config.vocab_size]` (see the `input_ids` docstring). Tokens with indices set to `-1` are ignored
+ (masked); the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "`RealmForOpenQA` for end-to-end open domain question answering.",
+ REALM_START_DOCSTRING,
+)
+class RealmForOpenQA(RealmPreTrainedModel):
+ def __init__(self, config, retriever=None):
+ super().__init__(config)
+ self.embedder = RealmEmbedder(config)
+ self.reader = RealmReader(config)
+ self.register_buffer(
+ "block_emb",
+ torch.zeros(()).new_empty(
+ size=(config.num_block_records, config.retriever_proj_size),
+ dtype=torch.float32,
+ device=torch.device("cpu"),
+ ),
+ )
+ self.retriever = retriever
+
+ self.post_init()
+
+ @property
+ def searcher_beam_size(self):
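+ # Training retrieves with the (typically larger) searcher beam; at inference
+ # time only `reader_beam_size` blocks are kept, so the two beams coincide.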
+ if self.training:
+ return self.config.searcher_beam_size
+ return self.config.reader_beam_size
+
+ def block_embedding_to(self, device):
+ """Send `self.block_emb` to a specific device.
+
+ Args:
+ device (`str` or `torch.device`):
+ The device to which `self.block_emb` will be sent.
+ """
+
+ self.block_emb = self.block_emb.to(device)
+
+ @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING.format("1, sequence_length"))
+ @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor],
+ attention_mask: Optional[torch.FloatTensor] = None,
+ token_type_ids: Optional[torch.LongTensor] = None,
+ answer_ids: Optional[torch.LongTensor] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, RealmForOpenQAOutput]:
+ r"""
+ Returns:
+
+ Example:
+
+ ```python
+ >>> import torch
+ >>> from transformers import RealmForOpenQA, RealmRetriever, AutoTokenizer
+
+ >>> retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa")
+ >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
+ >>> model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa", retriever=retriever)
+
+ >>> question = "Who is the pioneer in modern computer science?"
+ >>> question_ids = tokenizer([question], return_tensors="pt")
+ >>> answer_ids = tokenizer(
+ ... ["alan mathison turing"],
+ ... add_special_tokens=False,
+ ... return_token_type_ids=False,
+ ... return_attention_mask=False,
+ ... ).input_ids
+
+ >>> reader_output, predicted_answer_ids = model(**question_ids, answer_ids=answer_ids, return_dict=False)
+ >>> predicted_answer = tokenizer.decode(predicted_answer_ids)
+ >>> loss = reader_output.loss
+ ```"""
+
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is not None and input_ids.shape[0] != 1:
+ raise ValueError("The batch_size of the inputs must be 1.")
+
+ question_outputs = self.embedder(
+ input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True
+ )
+ # [1, projection_size]
+ question_projection = question_outputs[0]
+
+ # CPU computation starts.
+ # [1, num_block_records]
+ batch_scores = torch.einsum("BD,QD->QB", self.block_emb, question_projection.to(self.block_emb.device))
+ # [1, searcher_beam_size]
+ _, retrieved_block_ids = torch.topk(batch_scores, k=self.searcher_beam_size, dim=-1)
+ # [searcher_beam_size]
+ retrieved_block_ids = retrieved_block_ids.squeeze()
+ # [searcher_beam_size, projection_size]
+ retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids)
+ # CPU computation ends.
+
+ # Retrieve possible answers
+ has_answers, start_pos, end_pos, concat_inputs = self.retriever(
+ retrieved_block_ids.cpu(), input_ids, answer_ids, max_length=self.config.reader_seq_len
+ )
+
+ concat_inputs = concat_inputs.to(self.reader.device)
+ block_mask = concat_inputs.special_tokens_mask.type(torch.bool).to(device=self.reader.device)
+ block_mask.logical_not_().logical_and_(concat_inputs.token_type_ids.type(torch.bool))
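+ # block_mask now selects exactly the evidence-block tokens: positions that are
+ # not special tokens (mask inverted) and belong to segment B (token_type_ids == 1).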
+
+ if has_answers is not None:
+ has_answers = torch.tensor(has_answers, dtype=torch.bool, device=self.reader.device)
+ start_pos = torch.tensor(start_pos, dtype=torch.long, device=self.reader.device)
+ end_pos = torch.tensor(end_pos, dtype=torch.long, device=self.reader.device)
+
+ # [searcher_beam_size]
+ retrieved_logits = torch.einsum(
+ "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(self.reader.device)
+ )
+
+ reader_output = self.reader(
+ input_ids=concat_inputs.input_ids[0 : self.config.reader_beam_size],
+ attention_mask=concat_inputs.attention_mask[0 : self.config.reader_beam_size],
+ token_type_ids=concat_inputs.token_type_ids[0 : self.config.reader_beam_size],
+ relevance_score=retrieved_logits,
+ block_mask=block_mask,
+ has_answers=has_answers,
+ start_positions=start_pos,
+ end_positions=end_pos,
+ return_dict=True,
+ )
+
+ predicted_block = concat_inputs.input_ids[reader_output.block_idx]
+ predicted_answer_ids = predicted_block[reader_output.start_pos : reader_output.end_pos + 1]
+
+ if not return_dict:
+ return reader_output, predicted_answer_ids
+
+ return RealmForOpenQAOutput(
+ reader_output=reader_output,
+ predicted_answer_ids=predicted_answer_ids,
+ )
+
+
+__all__ = [
+ "RealmEmbedder",
+ "RealmForOpenQA",
+ "RealmKnowledgeAugEncoder",
+ "RealmPreTrainedModel",
+ "RealmReader",
+ "RealmScorer",
+ "load_tf_weights_in_realm",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/retrieval_realm.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/retrieval_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e47abb11791aedccd5621677713136a037be8c
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/retrieval_realm.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""REALM Retriever model implementation."""
+
+import os
+from typing import Optional, Union
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+
+from transformers import AutoTokenizer
+
+from ....utils import logging, strtobool
+
+
+_REALM_BLOCK_RECORDS_FILENAME = "block_records.npy"
+
+
+logger = logging.get_logger(__name__)
+
+
+def convert_tfrecord_to_np(block_records_path: str, num_block_records: int) -> np.ndarray:
+ import tensorflow.compat.v1 as tf
+
+ blocks_dataset = tf.data.TFRecordDataset(block_records_path, buffer_size=512 * 1024 * 1024)
+ blocks_dataset = blocks_dataset.batch(num_block_records, drop_remainder=True)
+ np_record = next(blocks_dataset.take(1).as_numpy_iterator())
+
+ return np_record
+
+
+class ScaNNSearcher:
+ """Note that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included."""
+
+ def __init__(
+ self,
+ db,
+ num_neighbors,
+ dimensions_per_block=2,
+ num_leaves=1000,
+ num_leaves_to_search=100,
+ training_sample_size=100000,
+ ):
+ """Build scann searcher."""
+
+ from scann.scann_ops.py.scann_ops_pybind import builder as Builder
+
+ builder = Builder(db=db, num_neighbors=num_neighbors, distance_measure="dot_product")
+ builder = builder.tree(
+ num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search, training_sample_size=training_sample_size
+ )
+ builder = builder.score_ah(dimensions_per_block=dimensions_per_block)
+
+ self.searcher = builder.build()
+
+ def search_batched(self, question_projection):
+ retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu())
+ return retrieved_block_ids.astype("int64")
+
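+# A minimal usage sketch (illustrative only): it assumes the optional `scann`
+# package is installed, and `block_emb` stands in for a hypothetical
+# (num_blocks, dim) float array of evidence-block embeddings:
+#
+#     searcher = ScaNNSearcher(db=block_emb, num_neighbors=8)
+#     block_ids = searcher.search_batched(question_projection)  # -> int64 ids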
+
+class RealmRetriever:
+ """The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer
+ positions."
+
+ Parameters:
+ block_records (`np.ndarray`):
+ A numpy array which contains evidence texts.
+ tokenizer ([`RealmTokenizer`]):
+ The tokenizer to encode retrieved texts.
+ """
+
+ def __init__(self, block_records, tokenizer):
+ super().__init__()
+ self.block_records = block_records
+ self.tokenizer = tokenizer
+
+ def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, max_length=None, return_tensors="pt"):
+ retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0)
+
+ question = self.tokenizer.decode(question_input_ids[0], skip_special_tokens=True)
+
+ text = []
+ text_pair = []
+ for retrieved_block in retrieved_blocks:
+ text.append(question)
+ text_pair.append(retrieved_block.decode())
+
+ concat_inputs = self.tokenizer(
+ text, text_pair, padding=True, truncation=True, return_special_tokens_mask=True, max_length=max_length
+ )
+ concat_inputs_tensors = concat_inputs.convert_to_tensors(return_tensors)
+
+ if answer_ids is not None:
+ return self.block_has_answer(concat_inputs, answer_ids) + (concat_inputs_tensors,)
+ else:
+ return (None, None, None, concat_inputs_tensors)
+
+ @classmethod
+ def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *init_inputs, **kwargs):
+ if os.path.isdir(pretrained_model_name_or_path):
+ block_records_path = os.path.join(pretrained_model_name_or_path, _REALM_BLOCK_RECORDS_FILENAME)
+ else:
+ block_records_path = hf_hub_download(
+ repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs
+ )
+ if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+ raise ValueError(
+ "This part uses `pickle.load` which is insecure and will execute arbitrary code that is "
+ "potentially malicious. It's recommended to never unpickle data that could have come from an "
+ "untrusted source, or that could have been tampered with. If you already verified the pickle "
+ "data and decided to use it, you can set the environment variable "
+ "`TRUST_REMOTE_CODE` to `True` to allow it."
+ )
+ block_records = np.load(block_records_path, allow_pickle=True)
+
+ tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
+
+ return cls(block_records, tokenizer)
+
+ def save_pretrained(self, save_directory):
+ # save block records
+ np.save(os.path.join(save_directory, _REALM_BLOCK_RECORDS_FILENAME), self.block_records)
+ # save tokenizer
+ self.tokenizer.save_pretrained(save_directory)
+
+ def block_has_answer(self, concat_inputs, answer_ids):
+ """check if retrieved_blocks has answers."""
+ has_answers = []
+ start_pos = []
+ end_pos = []
+ max_answers = 0
+
+ for input_id in concat_inputs.input_ids:
+ input_id_list = input_id.tolist()
+ # Check answers between two [SEP] tokens
+ first_sep_idx = input_id_list.index(self.tokenizer.sep_token_id)
+ second_sep_idx = first_sep_idx + 1 + input_id_list[first_sep_idx + 1 :].index(self.tokenizer.sep_token_id)
+
+ start_pos.append([])
+ end_pos.append([])
+ for answer in answer_ids:
+ for idx in range(first_sep_idx + 1, second_sep_idx):
+ if answer[0] == input_id_list[idx]:
+ if input_id_list[idx : idx + len(answer)] == answer:
+ start_pos[-1].append(idx)
+ end_pos[-1].append(idx + len(answer) - 1)
+
+ if len(start_pos[-1]) == 0:
+ has_answers.append(False)
+ else:
+ has_answers.append(True)
+ if len(start_pos[-1]) > max_answers:
+ max_answers = len(start_pos[-1])
+
+ # Pad -1 to max_answers
+ for start_pos_, end_pos_ in zip(start_pos, end_pos):
+ if len(start_pos_) < max_answers:
+ padded = [-1] * (max_answers - len(start_pos_))
+ start_pos_ += padded
+ end_pos_ += padded
+ return has_answers, start_pos, end_pos
+
+
+__all__ = ["RealmRetriever"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/tokenization_realm.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/tokenization_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..af7cdc1a5bac3afea8ed0651ae2892adcc03cbc3
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/tokenization_realm.py
@@ -0,0 +1,534 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for REALM."""
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import PaddingStrategy, logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ with open(vocab_file, "r", encoding="utf-8") as reader:
+ tokens = reader.readlines()
+ for index, token in enumerate(tokens):
+ token = token.rstrip("\n")
+ vocab[token] = index
+ return vocab
+
+
+def whitespace_tokenize(text):
+ """Runs basic whitespace cleaning and splitting on a piece of text."""
+ text = text.strip()
+ if not text:
+ return []
+ tokens = text.split()
+ return tokens
+
+
+class RealmTokenizer(PreTrainedTokenizer):
+ r"""
+ Construct a REALM tokenizer.
+
+ [`RealmTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting and
+ wordpiece.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ File containing the vocabulary.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+ Whether or not to do basic tokenization before WordPiece.
+ never_split (`Iterable`, *optional*):
+ Collection of tokens which will never be split during tokenization. Only has an effect when
+ `do_basic_tokenize=True`
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters.
+
+ This should likely be deactivated for Japanese (see this
+ [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+ value for `lowercase` (as in the original BERT).
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=True,
+ do_basic_tokenize=True,
+ never_split=None,
+ unk_token="[UNK]",
+ sep_token="[SEP]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ tokenize_chinese_chars=True,
+ strip_accents=None,
+ **kwargs,
+ ):
+ if not os.path.isfile(vocab_file):
+ raise ValueError(
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+ " model use `tokenizer = RealmTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+ )
+ self.vocab = load_vocab(vocab_file)
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+ self.do_basic_tokenize = do_basic_tokenize
+ if do_basic_tokenize:
+ self.basic_tokenizer = BasicTokenizer(
+ do_lower_case=do_lower_case,
+ never_split=never_split,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ )
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
+
+ @property
+ def do_lower_case(self):
+ return self.basic_tokenizer.do_lower_case
+
+ @property
+ def vocab_size(self):
+ return len(self.vocab)
+
+ def get_vocab(self):
+ return dict(self.vocab, **self.added_tokens_encoder)
+
+ def _tokenize(self, text):
+ split_tokens = []
+ if self.do_basic_tokenize:
+ for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+ # If the token is part of the never_split set
+ if token in self.basic_tokenizer.never_split:
+ split_tokens.append(token)
+ else:
+ split_tokens += self.wordpiece_tokenizer.tokenize(token)
+ else:
+ split_tokens = self.wordpiece_tokenizer.tokenize(text)
+ return split_tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.ids_to_tokens.get(index, self.unk_token)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ out_string = " ".join(tokens).replace(" ##", "").strip()
+ return out_string
+
+ def batch_encode_candidates(self, text, **kwargs):
+ r"""
+ Encode a batch of text or text pairs. This method is similar to the regular `__call__` method but has the
+ following differences:
+
+ 1. It handles an additional num_candidates axis: (batch_size, num_candidates, text).
+ 2. It always pads the sequences to *max_length*.
+ 3. *max_length* must be specified in order to stack packs of candidates into a batch.
+
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+ Args:
+ text (`List[List[str]]`):
+ The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+ num_candidates, text).
+ text_pair (`List[List[str]]`, *optional*):
+ The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+ num_candidates, text).
+ **kwargs:
+ Keyword arguments of the __call__ method.
+
+ Returns:
+ [`BatchEncoding`]: Encoded text or text pair.
+
+ Example:
+
+ ```python
+ >>> from transformers import RealmTokenizer
+
+ >>> # batch_size = 2, num_candidates = 2
+ >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
+
+ >>> tokenizer = RealmTokenizer.from_pretrained("google/realm-cc-news-pretrained-encoder")
+ >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+ ```"""
+
+ # Always using a fixed sequence length to encode in order to stack candidates into a batch.
+ kwargs["padding"] = PaddingStrategy.MAX_LENGTH
+
+ batch_text = text
+ batch_text_pair = kwargs.pop("text_pair", None)
+ return_tensors = kwargs.pop("return_tensors", None)
+
+ output_data = {
+ "input_ids": [],
+ "attention_mask": [],
+ "token_type_ids": [],
+ }
+
+ for idx, candidate_text in enumerate(batch_text):
+ if batch_text_pair is not None:
+ candidate_text_pair = batch_text_pair[idx]
+ else:
+ candidate_text_pair = None
+
+ encoded_candidates = super().__call__(candidate_text, candidate_text_pair, return_tensors=None, **kwargs)
+
+ encoded_input_ids = encoded_candidates.get("input_ids")
+ encoded_attention_mask = encoded_candidates.get("attention_mask")
+ encoded_token_type_ids = encoded_candidates.get("token_type_ids")
+
+ if encoded_input_ids is not None:
+ output_data["input_ids"].append(encoded_input_ids)
+ if encoded_attention_mask is not None:
+ output_data["attention_mask"].append(encoded_attention_mask)
+ if encoded_token_type_ids is not None:
+ output_data["token_type_ids"].append(encoded_token_type_ids)
+
+ output_data = {key: item for key, item in output_data.items() if len(item) != 0}
+
+ return BatchEncoding(output_data, tensor_type=return_tensors)
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. A REALM sequence has the following format:
+
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + token_ids_1 + sep
+
+ def get_special_tokens_mask(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+ ) -> list[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is not None:
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1]
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ index = 0
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ else:
+ vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+ with open(vocab_file, "w", encoding="utf-8") as writer:
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+ " Please check that the vocabulary is not corrupted!"
+ )
+ index = token_index
+ writer.write(token + "\n")
+ index += 1
+ return (vocab_file,)
+
+
+class BasicTokenizer:
+ """
+ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+ Args:
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ never_split (`Iterable`, *optional*):
+ Collection of tokens which will never be split during tokenization. Only has an effect when
+ `do_basic_tokenize=True`
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters.
+
+ This should likely be deactivated for Japanese (see this
+ [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+ value for `lowercase` (as in the original BERT).
+ """
+
+ def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+ if never_split is None:
+ never_split = []
+ self.do_lower_case = do_lower_case
+ self.never_split = set(never_split)
+ self.tokenize_chinese_chars = tokenize_chinese_chars
+ self.strip_accents = strip_accents
+
+ def tokenize(self, text, never_split=None):
+ """
+ Basic tokenization of a piece of text. Splits on "white spaces" only; for sub-word tokenization, see
+ WordpieceTokenizer.
+
+ Args:
+ never_split (`List[str]`, *optional*):
+ Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+ [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
+ """
+ # union() returns a new set by concatenating the two sets.
+ never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+ text = self._clean_text(text)
+
+ # This was added on November 1st, 2018 for the multilingual and Chinese
+ # models. This is also applied to the English models now, but it doesn't
+ # matter since the English models were not trained on any Chinese data
+ # and generally don't have any Chinese data in them (there are Chinese
+ # characters in the vocabulary because the English Wikipedia does contain
+ # some Chinese words).
+ if self.tokenize_chinese_chars:
+ text = self._tokenize_chinese_chars(text)
+ orig_tokens = whitespace_tokenize(text)
+ split_tokens = []
+ for token in orig_tokens:
+ if token not in never_split:
+ if self.do_lower_case:
+ token = token.lower()
+ if self.strip_accents is not False:
+ token = self._run_strip_accents(token)
+ elif self.strip_accents:
+ token = self._run_strip_accents(token)
+ split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
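+ # e.g. with do_lower_case=True: "HeLLo, world!" -> ["hello", ",", "world", "!"]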
+ return output_tokens
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
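+ # e.g. "café" becomes "cafe" plus a combining accent (category "Mn"), dropped below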
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _run_split_on_punc(self, text, never_split=None):
+ """Splits punctuation on a piece of text."""
+ if never_split is not None and text in never_split:
+ return [text]
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def _tokenize_chinese_chars(self, text):
+ """Adds whitespace around any CJK character."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if self._is_chinese_char(cp):
+ output.append(" ")
+ output.append(char)
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+ def _is_chinese_char(self, cp):
+ """Checks whether CP is the codepoint of a CJK character."""
+ # This defines a "chinese character" as anything in the CJK Unicode block:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ #
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+ # despite its name. The modern Korean Hangul alphabet is a different block,
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+ # space-separated words, so they are not treated specially and are handled
+ # like all of the other languages.
+ if (
+ (cp >= 0x4E00 and cp <= 0x9FFF)
+ or (cp >= 0x3400 and cp <= 0x4DBF)
+ or (cp >= 0x20000 and cp <= 0x2A6DF)
+ or (cp >= 0x2A700 and cp <= 0x2B73F)
+ or (cp >= 0x2B740 and cp <= 0x2B81F)
+ or (cp >= 0x2B820 and cp <= 0x2CEAF)
+ or (cp >= 0xF900 and cp <= 0xFAFF)
+ or (cp >= 0x2F800 and cp <= 0x2FA1F)
+ ):
+ return True
+
+ return False
+
+ def _clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if cp == 0 or cp == 0xFFFD or _is_control(char):
+ continue
+ if _is_whitespace(char):
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+
+class WordpieceTokenizer:
+ """Runs WordPiece tokenization."""
+
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, text):
+ """
+ Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+ tokenization using the given vocabulary.
+
+ For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+ Args:
+ text: A single token or whitespace-separated tokens. This should have
+ already been passed through *BasicTokenizer*.
+
+ Returns:
+ A list of wordpiece tokens.
+ """
+
+ output_tokens = []
+ for token in whitespace_tokenize(text):
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ output_tokens.append(self.unk_token)
+ continue
+
+ is_bad = False
+ start = 0
+ sub_tokens = []
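+ # Greedy longest-match-first: take the longest vocab entry starting at `start`;
+ # non-initial pieces carry the "##" continuation prefix.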
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if start > 0:
+ substr = "##" + substr
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ is_bad = True
+ break
+ sub_tokens.append(cur_substr)
+ start = end
+
+ if is_bad:
+ output_tokens.append(self.unk_token)
+ else:
+ output_tokens.extend(sub_tokens)
+ return output_tokens
+
+
+__all__ = ["RealmTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..53a1d99e0ac804ae196e9c65f0e2979dcb2401c8
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py
@@ -0,0 +1,223 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization classes for REALM."""
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ....tokenization_utils_base import BatchEncoding
+from ....tokenization_utils_fast import PreTrainedTokenizerFast
+from ....utils import PaddingStrategy, logging
+from .tokenization_realm import RealmTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class RealmTokenizerFast(PreTrainedTokenizerFast):
+ r"""
+ Construct a "fast" REALM tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
+
+ [`RealmTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation
+ splitting and wordpiece.
+
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ File containing the vocabulary.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ clean_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to clean the text before tokenization by removing any control characters and replacing all
+ whitespace characters with the classic space.
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+ issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+ value for `lowercase` (as in the original BERT).
+ wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+ The prefix for subwords.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ slow_tokenizer_class = RealmTokenizer
+
+ def __init__(
+ self,
+ vocab_file=None,
+ tokenizer_file=None,
+ do_lower_case=True,
+ unk_token="[UNK]",
+ sep_token="[SEP]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ tokenize_chinese_chars=True,
+ strip_accents=None,
+ **kwargs,
+ ):
+ super().__init__(
+ vocab_file,
+ tokenizer_file=tokenizer_file,
+ do_lower_case=do_lower_case,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
+
+ normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+ if (
+ normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+ or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+ or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+ ):
+ normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+ normalizer_state["lowercase"] = do_lower_case
+ normalizer_state["strip_accents"] = strip_accents
+ normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+ self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+ self.do_lower_case = do_lower_case
+
+ def batch_encode_candidates(self, text, **kwargs):
+ r"""
+ Encode a batch of text or text pairs. This method is similar to the regular `__call__` method but has the
+ following differences:
+
+ 1. It handles an additional num_candidates axis: (batch_size, num_candidates, text).
+ 2. It always pads the sequences to *max_length*.
+ 3. *max_length* must be specified in order to stack packs of candidates into a batch.
+
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+ Args:
+ text (`List[List[str]]`):
+ The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+ num_candidates, text).
+ text_pair (`List[List[str]]`, *optional*):
+ The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+ num_candidates, text).
+ **kwargs:
+ Keyword arguments of the __call__ method.
+
+ Returns:
+ [`BatchEncoding`]: Encoded text or text pair.
+
+ Example:
+
+ ```python
+ >>> from transformers import RealmTokenizerFast
+
+ >>> # batch_size = 2, num_candidates = 2
+ >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
+
+ >>> tokenizer = RealmTokenizerFast.from_pretrained("google/realm-cc-news-pretrained-encoder")
+ >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+ ```"""
+
+ # Always using a fixed sequence length to encode in order to stack candidates into a batch.
+ kwargs["padding"] = PaddingStrategy.MAX_LENGTH
+
+ batch_text = text
+ batch_text_pair = kwargs.pop("text_pair", None)
+ return_tensors = kwargs.pop("return_tensors", None)
+
+ output_data = {
+ "input_ids": [],
+ "attention_mask": [],
+ "token_type_ids": [],
+ }
+
+ for idx, candidate_text in enumerate(batch_text):
+ if batch_text_pair is not None:
+ candidate_text_pair = batch_text_pair[idx]
+ else:
+ candidate_text_pair = None
+
+ encoded_candidates = super().__call__(candidate_text, candidate_text_pair, return_tensors=None, **kwargs)
+
+ encoded_input_ids = encoded_candidates.get("input_ids")
+ encoded_attention_mask = encoded_candidates.get("attention_mask")
+ encoded_token_type_ids = encoded_candidates.get("token_type_ids")
+
+ if encoded_input_ids is not None:
+ output_data["input_ids"].append(encoded_input_ids)
+ if encoded_attention_mask is not None:
+ output_data["attention_mask"].append(encoded_attention_mask)
+ if encoded_token_type_ids is not None:
+ output_data["token_type_ids"].append(encoded_token_type_ids)
+
+ output_data = {key: item for key, item in output_data.items() if len(item) != 0}
+
+ return BatchEncoding(output_data, tensor_type=return_tensors)
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. A REALM sequence has the following format:
+
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+ if token_ids_1 is not None:
+ output += token_ids_1 + [self.sep_token_id]
+
+ return output
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+ return tuple(files)
+
+
+__all__ = ["RealmTokenizerFast"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a875576607430c041abab01eccaf468a6cc9272e
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_retribert import *
+ from .modeling_retribert import *
+ from .tokenization_retribert import *
+ from .tokenization_retribert_fast import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fa5541f4ec1cb64dc883477c7402e5aa4a39c4f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..472bd540aa464f3ba7d4f41f611ff7f59ab448e6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/configuration_retribert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b94d9af7f55e6d48047a6eb2958a9bff1c8315d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/modeling_retribert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8a87ab863de14652c67a0c55351f146d41f6ebef
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..676b0b0912af15f8e113f8f7f23ca55e525268d2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/__pycache__/tokenization_retribert_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py
new file mode 100644
index 0000000000000000000000000000000000000000..80d755a16961450cae783d834385e5e6873dc24e
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RetriBERT model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class RetriBertConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`RetriBertModel`]. It is used to instantiate a
+ RetriBertModel model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the RetriBERT
+ [yjernite/retribert-base-uncased](https://huggingface.co/yjernite/retribert-base-uncased) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the RetriBERT model. Defines the number of different tokens that can be represented by
+ the `inputs_ids` passed when calling [`RetriBertModel`]
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 8):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ type_vocab_size (`int`, *optional*, defaults to 2):
+ The vocabulary size of the *token_type_ids* passed into [`BertModel`].
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ share_encoders (`bool`, *optional*, defaults to `True`):
+ Whether or not to use the same Bert-type encoder for the queries and the documents.
+ projection_dim (`int`, *optional*, defaults to 128):
+ Final dimension of the query and document representations after projection.
+ """
+
+ model_type = "retribert"
+
+ def __init__(
+ self,
+ vocab_size=30522,
+ hidden_size=768,
+ num_hidden_layers=8,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ max_position_embeddings=512,
+ type_vocab_size=2,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ share_encoders=True,
+ projection_dim=128,
+ pad_token_id=0,
+ **kwargs,
+ ):
+ super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.hidden_act = hidden_act
+ self.intermediate_size = intermediate_size
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_size = type_vocab_size
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.share_encoders = share_encoders
+ self.projection_dim = projection_dim
+
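+# A minimal instantiation sketch (argument values here are illustrative only):
+#
+#     config = RetriBertConfig(num_hidden_layers=4, projection_dim=64)
+#     assert config.share_encoders and config.projection_dim == 64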
+
+__all__ = ["RetriBertConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py
new file mode 100644
index 0000000000000000000000000000000000000000..06806e8e6d0bb01fa31c6849ccb23f3e29bf74aa
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+RetriBERT model
+"""
+
+import math
+from typing import Optional
+
+import torch
+import torch.utils.checkpoint as checkpoint
+from torch import nn
+
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging
+from ...bert.modeling_bert import BertModel
+from .configuration_retribert import RetriBertConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
+class RetriBertPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: RetriBertConfig
+ load_tf_weights = None
+ base_model_prefix = "retribert"
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+RETRIBERT_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`RetriBertConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+ """Bert Based model to embed queries or document for document retrieval.""",
+ RETRIBERT_START_DOCSTRING,
+)
+class RetriBertModel(RetriBertPreTrainedModel):
+ def __init__(self, config: RetriBertConfig) -> None:
+ super().__init__(config)
+ self.projection_dim = config.projection_dim
+
+ self.bert_query = BertModel(config)
+ self.bert_doc = None if config.share_encoders else BertModel(config)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.project_query = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
+ self.project_doc = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
+
+ self.ce_loss = nn.CrossEntropyLoss(reduction="mean")
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def embed_sentences_checkpointed(
+ self,
+ input_ids,
+ attention_mask,
+ sent_encoder,
+ checkpoint_batch_size=-1,
+ ):
+ # reproduces BERT forward pass with checkpointing
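+ # e.g. with checkpoint_batch_size=8 and 32 inputs, four slices of 8 are encoded
+ # under torch.utils.checkpoint; activations are recomputed during backward,
+ # trading extra compute for lower peak memory.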
+ if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
+ return sent_encoder(input_ids, attention_mask=attention_mask)[1]
+ else:
+ # prepare implicit variables
+ device = input_ids.device
+ input_shape = input_ids.size()
+ token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+ head_mask = [None] * sent_encoder.config.num_hidden_layers
+ extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask(
+ attention_mask, input_shape
+ )
+
+ # define function for checkpointing
+ def partial_encode(*inputs):
+ encoder_outputs = sent_encoder.encoder(
+ inputs[0],
+ attention_mask=inputs[1],
+ head_mask=head_mask,
+ )
+ sequence_output = encoder_outputs[0]
+ pooled_output = sent_encoder.pooler(sequence_output)
+ return pooled_output
+
+ # run embedding layer on everything at once
+ embedding_output = sent_encoder.embeddings(
+ input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, inputs_embeds=None
+ )
+ # run encoding and pooling on one mini-batch at a time
+ pooled_output_list = []
+ for b in range(math.ceil(input_ids.shape[0] / checkpoint_batch_size)):
+ b_embedding_output = embedding_output[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
+ b_attention_mask = extended_attention_mask[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
+ pooled_output = checkpoint.checkpoint(partial_encode, b_embedding_output, b_attention_mask)
+ pooled_output_list.append(pooled_output)
+ return torch.cat(pooled_output_list, dim=0)
+
+ def embed_questions(
+ self,
+ input_ids,
+ attention_mask=None,
+ checkpoint_batch_size=-1,
+ ):
+ q_reps = self.embed_sentences_checkpointed(
+ input_ids,
+ attention_mask,
+ self.bert_query,
+ checkpoint_batch_size,
+ )
+ return self.project_query(q_reps)
+
+ def embed_answers(
+ self,
+ input_ids,
+ attention_mask=None,
+ checkpoint_batch_size=-1,
+ ):
+ a_reps = self.embed_sentences_checkpointed(
+ input_ids,
+ attention_mask,
+ self.bert_query if self.bert_doc is None else self.bert_doc,
+ checkpoint_batch_size,
+ )
+ return self.project_doc(a_reps)
+
+ def forward(
+ self,
+ input_ids_query: torch.LongTensor,
+ attention_mask_query: Optional[torch.FloatTensor],
+ input_ids_doc: torch.LongTensor,
+ attention_mask_doc: Optional[torch.FloatTensor],
+ checkpoint_batch_size: int = -1,
+ ) -> torch.FloatTensor:
+ r"""
+ Args:
+ input_ids_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary for the queries in a batch.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask_query (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ input_ids_doc (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary for the documents in a batch.
+ attention_mask_doc (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on documents padding token indices.
+ checkpoint_batch_size (`int`, *optional*, defaults to `-1`):
+ If greater than 0, uses gradient checkpointing to compute sequence representations on only
+ `checkpoint_batch_size` examples at a time on the GPU. All query representations are still compared to
+ all document representations in the batch.
+
+ Return:
+ `torch.FloatTensor`: The bidirectional cross-entropy loss obtained while trying to match each query to
+ its corresponding document and each document to its corresponding query in the batch.
+ """
+ device = input_ids_query.device
+ q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
+ a_reps = self.embed_answers(input_ids_doc, attention_mask_doc, checkpoint_batch_size)
+ compare_scores = torch.mm(q_reps, a_reps.t())
+ loss_qa = self.ce_loss(compare_scores, torch.arange(compare_scores.shape[1]).to(device))
+ loss_aq = self.ce_loss(compare_scores.t(), torch.arange(compare_scores.shape[0]).to(device))
+ loss = (loss_qa + loss_aq) / 2
+ return loss
+
+
+__all__ = ["RetriBertModel", "RetriBertPreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py
new file mode 100644
index 0000000000000000000000000000000000000000..288b46e267bccb358fa86e74ac2b98dcb99e8904
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py
@@ -0,0 +1,475 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for RetriBERT."""
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ with open(vocab_file, "r", encoding="utf-8") as reader:
+ tokens = reader.readlines()
+ for index, token in enumerate(tokens):
+ token = token.rstrip("\n")
+ vocab[token] = index
+ return vocab
+
+
+def whitespace_tokenize(text):
+ """Runs basic whitespace cleaning and splitting on a piece of text."""
+ text = text.strip()
+ if not text:
+ return []
+ tokens = text.split()
+ return tokens
+
+
+class RetriBertTokenizer(PreTrainedTokenizer):
+ r"""
+ Constructs a RetriBERT tokenizer.
+
+ [`RetriBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting
+ and wordpiece.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
+ to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ File containing the vocabulary.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+ Whether or not to do basic tokenization before WordPiece.
+ never_split (`Iterable`, *optional*):
+ Collection of tokens which will never be split during tokenization. Only has an effect when
+ `do_basic_tokenize=True`
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
+ [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+ value for `lowercase` (as in the original BERT).
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ do_lower_case=True,
+ do_basic_tokenize=True,
+ never_split=None,
+ unk_token="[UNK]",
+ sep_token="[SEP]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ tokenize_chinese_chars=True,
+ strip_accents=None,
+ **kwargs,
+ ):
+ if not os.path.isfile(vocab_file):
+ raise ValueError(
+ f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+ " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+ )
+ self.vocab = load_vocab(vocab_file)
+ self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+ self.do_basic_tokenize = do_basic_tokenize
+ if do_basic_tokenize:
+ self.basic_tokenizer = BasicTokenizer(
+ do_lower_case=do_lower_case,
+ never_split=never_split,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ )
+
+ self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+ super().__init__(
+ do_lower_case=do_lower_case,
+ do_basic_tokenize=do_basic_tokenize,
+ never_split=never_split,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
+
+ @property
+ def do_lower_case(self):
+ return self.basic_tokenizer.do_lower_case
+
+ @property
+ def vocab_size(self):
+ return len(self.vocab)
+
+ def get_vocab(self):
+ return dict(self.vocab, **self.added_tokens_encoder)
+
+ def _tokenize(self, text, split_special_tokens=False):
+ split_tokens = []
+ if self.do_basic_tokenize:
+ for token in self.basic_tokenizer.tokenize(
+ text, never_split=self.all_special_tokens if not split_special_tokens else None
+ ):
+ # If the token is part of the never_split set
+ if token in self.basic_tokenizer.never_split:
+ split_tokens.append(token)
+ else:
+ split_tokens += self.wordpiece_tokenizer.tokenize(token)
+ else:
+ split_tokens = self.wordpiece_tokenizer.tokenize(text)
+ return split_tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.ids_to_tokens.get(index, self.unk_token)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ out_string = " ".join(tokens).replace(" ##", "").strip()
+ return out_string
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ """
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+ adding special tokens. A BERT sequence has the following format:
+
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + token_ids_1 + sep
+
+ def get_special_tokens_mask(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+ ) -> list[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is not None:
+ return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1]
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ index = 0
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ else:
+ vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+ with open(vocab_file, "w", encoding="utf-8") as writer:
+ for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+ " Please check that the vocabulary is not corrupted!"
+ )
+ index = token_index
+ writer.write(token + "\n")
+ index += 1
+ return (vocab_file,)
+
+
+class BasicTokenizer:
+ """
+ Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+ Args:
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ never_split (`Iterable`, *optional*):
+ Collection of tokens which will never be split during tokenization. Only has an effect when
+ `do_basic_tokenize=True`
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters.
+
+ This should likely be deactivated for Japanese (see this
+ [issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+ value for `lowercase` (as in the original BERT).
+ do_split_on_punc (`bool`, *optional*, defaults to `True`):
+ In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+ the full context of the words, such as contractions.
+ """
+
+ def __init__(
+ self,
+ do_lower_case=True,
+ never_split=None,
+ tokenize_chinese_chars=True,
+ strip_accents=None,
+ do_split_on_punc=True,
+ ):
+ if never_split is None:
+ never_split = []
+ self.do_lower_case = do_lower_case
+ self.never_split = set(never_split)
+ self.tokenize_chinese_chars = tokenize_chinese_chars
+ self.strip_accents = strip_accents
+ self.do_split_on_punc = do_split_on_punc
+
+ def tokenize(self, text, never_split=None):
+ """
+ Basic Tokenization of a piece of text. For sub-word tokenization, see WordpieceTokenizer.
+
+ Args:
+ never_split (`List[str]`, *optional*):
+ Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+ [`PreTrainedTokenizer.tokenize`]). List of tokens not to split.
+ """
+ # union() returns a new set by concatenating the two sets.
+ never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+ text = self._clean_text(text)
+
+ # This was added on November 1st, 2018 for the multilingual and Chinese
+ # models. This is also applied to the English models now, but it doesn't
+ # matter since the English models were not trained on any Chinese data
+ # and generally don't have any Chinese data in them (there are Chinese
+ # characters in the vocabulary because Wikipedia does have some Chinese
+ # words in the English Wikipedia).
+ if self.tokenize_chinese_chars:
+ text = self._tokenize_chinese_chars(text)
+ # prevents treating the same character with different unicode codepoints as different characters
+ unicode_normalized_text = unicodedata.normalize("NFC", text)
+ orig_tokens = whitespace_tokenize(unicode_normalized_text)
+ split_tokens = []
+ for token in orig_tokens:
+ if token not in never_split:
+ if self.do_lower_case:
+ token = token.lower()
+ if self.strip_accents is not False:
+ token = self._run_strip_accents(token)
+ elif self.strip_accents:
+ token = self._run_strip_accents(token)
+ split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+ output_tokens = whitespace_tokenize(" ".join(split_tokens))
+ return output_tokens
+
+ def _run_strip_accents(self, text):
+ """Strips accents from a piece of text."""
+ text = unicodedata.normalize("NFD", text)
+ output = []
+ for char in text:
+ cat = unicodedata.category(char)
+ if cat == "Mn":
+ continue
+ output.append(char)
+ return "".join(output)
+
+ def _run_split_on_punc(self, text, never_split=None):
+ """Splits punctuation on a piece of text."""
+ if not self.do_split_on_punc or (never_split is not None and text in never_split):
+ return [text]
+ chars = list(text)
+ i = 0
+ start_new_word = True
+ output = []
+ while i < len(chars):
+ char = chars[i]
+ if _is_punctuation(char):
+ output.append([char])
+ start_new_word = True
+ else:
+ if start_new_word:
+ output.append([])
+ start_new_word = False
+ output[-1].append(char)
+ i += 1
+
+ return ["".join(x) for x in output]
+
+ def _tokenize_chinese_chars(self, text):
+ """Adds whitespace around any CJK character."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if self._is_chinese_char(cp):
+ output.append(" ")
+ output.append(char)
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+ def _is_chinese_char(self, cp):
+ """Checks whether CP is the codepoint of a CJK character."""
+ # This defines a "chinese character" as anything in the CJK Unicode block:
+ # https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+ #
+ # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+ # despite its name. The modern Korean Hangul alphabet is a different block,
+ # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+ # space-separated words, so they are not treated specially and handled
+ # like all of the other languages.
+ if (
+ (cp >= 0x4E00 and cp <= 0x9FFF)
+ or (cp >= 0x3400 and cp <= 0x4DBF)
+ or (cp >= 0x20000 and cp <= 0x2A6DF)
+ or (cp >= 0x2A700 and cp <= 0x2B73F)
+ or (cp >= 0x2B740 and cp <= 0x2B81F)
+ or (cp >= 0x2B820 and cp <= 0x2CEAF)
+ or (cp >= 0xF900 and cp <= 0xFAFF)
+ or (cp >= 0x2F800 and cp <= 0x2FA1F)
+ ):
+ return True
+
+ return False
+
+ def _clean_text(self, text):
+ """Performs invalid character removal and whitespace cleanup on text."""
+ output = []
+ for char in text:
+ cp = ord(char)
+ if cp == 0 or cp == 0xFFFD or _is_control(char):
+ continue
+ if _is_whitespace(char):
+ output.append(" ")
+ else:
+ output.append(char)
+ return "".join(output)
+
+
+class WordpieceTokenizer:
+ """Runs WordPiece tokenization."""
+
+ def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+ self.vocab = vocab
+ self.unk_token = unk_token
+ self.max_input_chars_per_word = max_input_chars_per_word
+
+ def tokenize(self, text):
+ """
+ Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+ tokenization using the given vocabulary.
+
+ For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+ Args:
+ text: A single token or whitespace separated tokens. This should have
+ already been passed through *BasicTokenizer*.
+
+ Returns:
+ A list of wordpiece tokens.
+ """
+
+ output_tokens = []
+ for token in whitespace_tokenize(text):
+ chars = list(token)
+ if len(chars) > self.max_input_chars_per_word:
+ output_tokens.append(self.unk_token)
+ continue
+
+ is_bad = False
+ start = 0
+ sub_tokens = []
+ while start < len(chars):
+ end = len(chars)
+ cur_substr = None
+ while start < end:
+ substr = "".join(chars[start:end])
+ if start > 0:
+ substr = "##" + substr
+ if substr in self.vocab:
+ cur_substr = substr
+ break
+ end -= 1
+ if cur_substr is None:
+ is_bad = True
+ break
+ sub_tokens.append(cur_substr)
+ start = end
+
+ if is_bad:
+ output_tokens.append(self.unk_token)
+ else:
+ output_tokens.extend(sub_tokens)
+ return output_tokens
+
+
+__all__ = ["RetriBertTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fcbc395a1b7ba7f659a9befae14c8ba658c27d0
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for RetriBERT."""
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ....tokenization_utils_fast import PreTrainedTokenizerFast
+from ....utils import logging
+from .tokenization_retribert import RetriBertTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class RetriBertTokenizerFast(PreTrainedTokenizerFast):
+ r"""
+ Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's *tokenizers* library).
+
+ [`RetriBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation
+ splitting and wordpiece.
+
+ This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+ refer to this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ File containing the vocabulary.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ clean_text (`bool`, *optional*, defaults to `True`):
+ Whether or not to clean the text before tokenization by removing any control characters and replacing all
+ whitespace characters with a standard space.
+ tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+ Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+ issue](https://github.com/huggingface/transformers/issues/328)).
+ strip_accents (`bool`, *optional*):
+ Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+ value for `lowercase` (as in the original BERT).
+ wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+ The prefix for subwords.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ slow_tokenizer_class = RetriBertTokenizer
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file=None,
+ tokenizer_file=None,
+ do_lower_case=True,
+ unk_token="[UNK]",
+ sep_token="[SEP]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ tokenize_chinese_chars=True,
+ strip_accents=None,
+ **kwargs,
+ ):
+ super().__init__(
+ vocab_file,
+ tokenizer_file=tokenizer_file,
+ do_lower_case=do_lower_case,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ tokenize_chinese_chars=tokenize_chinese_chars,
+ strip_accents=strip_accents,
+ **kwargs,
+ )
+
+ normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+ if (
+ normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+ or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+ or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+ ):
+ normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+ normalizer_state["lowercase"] = do_lower_case
+ normalizer_state["strip_accents"] = strip_accents
+ normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+ self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+ self.do_lower_case = do_lower_case
+
+ def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+ """
+ Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating and
+ adding special tokens. A BERT sequence has the following format:
+
+ - single sequence: `[CLS] X [SEP]`
+ - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+ Args:
+ token_ids_0 (`List[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`List[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+ output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+ if token_ids_1 is not None:
+ output += token_ids_1 + [self.sep_token_id]
+
+ return output
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+ return tuple(files)
+
+
+__all__ = ["RetriBertTokenizerFast"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..78c549b6e294e1ca97a718ef601472d4d400e12a
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_speech_to_text_2 import *
+ from .modeling_speech_to_text_2 import *
+ from .processing_speech_to_text_2 import *
+ from .tokenization_speech_to_text_2 import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34c8db6d3b5b5ca0b94af1307a54fb90dc477b21
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..655e6aedc0017baa439bebf3eddbcbc1b77c12ae
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/configuration_speech_to_text_2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b55f4da722a6541ba19356ea6fd6631362d3e4f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/modeling_speech_to_text_2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0a0c77295e5445e986e0af5a649d8effd3e5bb6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/processing_speech_to_text_2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0af7bfac9d08c5a4fef50328734967bbf700b91c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/__pycache__/tokenization_speech_to_text_2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7cde922740a9a2173c64311aec8f6ad5496ac03
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py
@@ -0,0 +1,134 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Speech2Text model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Speech2Text2Config(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`Speech2Text2ForCausalLM`]. It is used to
+ instantiate a Speech2Text2 model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text2
+ [facebook/s2t-wav2vec2-large-en-de](https://huggingface.co/facebook/s2t-wav2vec2-large-en-de) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 10000):
+ Vocabulary size of the Speech2Text2 model. Defines the number of different tokens that can be represented
+ by the `input_ids` passed when calling [`Speech2Text2ForCausalLM`].
+ d_model (`int`, *optional*, defaults to 256):
+ Dimensionality of the layers and the pooler layer.
+ decoder_layers (`int`, *optional*, defaults to 6):
+ Number of decoder layers.
+ decoder_attention_heads (`int`, *optional*, defaults to 4):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the pooler. If string, `"gelu"`, `"relu"`,
+ `"silu"` and `"gelu_new"` are supported.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, and pooler.
+ attention_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ activation_dropout (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for activations inside the fully connected layer.
+ init_std (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+ The LayerDrop probability for the decoder. See the [LayerDrop paper](https://huggingface.co/papers/1909.11556)
+ for more details.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ max_target_positions (`int`, *optional*, defaults to 1024):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+
+ Example:
+
+ ```python
+ >>> from transformers import Speech2Text2Config, Speech2Text2ForCausalLM
+
+ >>> # Initializing a Speech2Text2 s2t_transformer_s style configuration
+ >>> configuration = Speech2Text2Config()
+
+ >>> # Initializing a model (with random weights) from the s2t_transformer_s style configuration
+ >>> model = Speech2Text2ForCausalLM(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "speech_to_text_2"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
+
+ def __init__(
+ self,
+ vocab_size=10000,
+ decoder_layers=6,
+ decoder_ffn_dim=2048,
+ decoder_attention_heads=4,
+ decoder_layerdrop=0.0,
+ use_cache=True,
+ activation_function="relu",
+ d_model=256,
+ dropout=0.1,
+ attention_dropout=0.0,
+ activation_dropout=0.0,
+ init_std=0.02,
+ decoder_start_token_id=2,
+ scale_embedding=True,
+ pad_token_id=1,
+ bos_token_id=0,
+ eos_token_id=2,
+ max_target_positions=1024,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.d_model = d_model
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.decoder_layers = decoder_layers
+ self.decoder_attention_heads = decoder_attention_heads
+ self.dropout = dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.activation_function = activation_function
+ self.init_std = init_std
+ self.decoder_layerdrop = decoder_layerdrop
+ self.use_cache = use_cache
+ self.num_hidden_layers = decoder_layers
+ self.scale_embedding = scale_embedding # scale factor will be sqrt(d_model) if True
+ self.max_target_positions = max_target_positions
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ decoder_start_token_id=decoder_start_token_id,
+ **kwargs,
+ )
+
+
+__all__ = ["Speech2Text2Config"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..854f21c0655046124fb439ec3e4a44395c33abcc
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py
@@ -0,0 +1,905 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Speech2Text2 model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging, replace_return_docstrings
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_speech_to_text_2 import Speech2Text2Config
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Speech2Text2Config"
+_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de"
+
+
+class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
+ """This module produces sinusoidal positional embeddings of any length."""
+
+ def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+ super().__init__()
+ self.offset = 2
+ self.embedding_dim = embedding_dim
+ self.padding_idx = padding_idx
+ # build the initial table so `self.weights` exists before the first forward
+ self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+ def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+ emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+ if hasattr(self, "weights"):
+ # in forward put the weights on the correct dtype and device of the param
+ emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+ self.weights = nn.Parameter(emb_weights)
+ self.weights.requires_grad = False
+ self.weights.detach_()
+
+ @staticmethod
+ def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+ """
+ Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
+ description in Section 3.5 of "Attention Is All You Need".
+ """
+ half_dim = embedding_dim // 2
+ emb = math.log(10000) / (half_dim - 1)
+ emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+ emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+ emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+ if embedding_dim % 2 == 1:
+ # zero pad
+ emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+ if padding_idx is not None:
+ emb[padding_idx, :] = 0
+ return emb.to(torch.get_default_dtype())
+
+ @torch.no_grad()
+ def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+ bsz, seq_len = input_ids.size()
+ # Create the position ids from the input token ids. Any padded tokens remain padded.
+ position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
+ input_ids.device
+ )
+
+ # expand embeddings if needed
+ max_pos = self.padding_idx + 1 + seq_len
+ if max_pos > self.weights.size(0):
+ self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+ return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
+
+ def create_position_ids_from_input_ids(
+ self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
+ ):
+ """
+ Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
+ symbols are ignored. This is modified from fairseq's `utils.make_positions`.
+
+ Args:
+ input_ids (`torch.Tensor`): Indices of input sequence tokens.
+ padding_idx (`int`): The id of the padding token.
+ past_key_values_length (`int`, *optional*): Length of the cached key/value states.
+
+ Returns: `torch.Tensor`
+ """
+ # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+ mask = input_ids.ne(padding_idx).int()
+ incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+ return incremental_indices.long() + padding_idx
+
+
+class Speech2Text2Attention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ embed_dim: int,
+ num_heads: int,
+ dropout: float = 0.0,
+ is_decoder: bool = False,
+ bias: bool = True,
+ is_causal: bool = False,
+ config: Optional[Speech2Text2Config] = None,
+ ):
+ super().__init__()
+ self.embed_dim = embed_dim
+ self.num_heads = num_heads
+ self.dropout = dropout
+ self.head_dim = embed_dim // num_heads
+ self.config = config
+
+ if (self.head_dim * num_heads) != self.embed_dim:
+ raise ValueError(
+ f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+ f" and `num_heads`: {num_heads})."
+ )
+ self.scaling = self.head_dim**-0.5
+ self.is_decoder = is_decoder
+ self.is_causal = is_causal
+
+ self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+ self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ key_value_states: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+ """Input shape: Batch x Time x Channel"""
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+
+ bsz, tgt_len, _ = hidden_states.size()
+
+ # get query proj
+ query_states = self.q_proj(hidden_states) * self.scaling
+ # get key, value proj
+ # `past_key_values[0].shape[2] == key_value_states.shape[1]`
+ # is checking that the `sequence_length` of the `past_key_values` is the same as
+ # the provided `key_value_states` to support prefix tuning
+ if (
+ is_cross_attention
+ and past_key_values is not None
+ and past_key_values[0].shape[2] == key_value_states.shape[1]
+ ):
+ # reuse k,v, cross_attentions
+ key_states = past_key_values[0]
+ value_states = past_key_values[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+ value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+ elif past_key_values is not None:
+ # reuse k, v, self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+ key_states = torch.cat([past_key_values[0], key_states], dim=2)
+ value_states = torch.cat([past_key_values[1], value_states], dim=2)
+ else:
+ # self_attention
+ key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+ value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+ if self.is_decoder:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+ # all previous decoder key/value_states. Further calls to uni-directional self-attention
+ # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+ # if encoder bi-directional self-attention `past_key_values` is always `None`
+ past_key_values = (key_states, value_states)
+
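+ # fold heads into the batch dim: (bsz, num_heads, seq, head_dim) -> (bsz * num_heads, seq, head_dim)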
+ proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+ key_states = key_states.reshape(*proj_shape)
+ value_states = value_states.reshape(*proj_shape)
+
+ src_len = key_states.size(1)
+ attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+ if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+ raise ValueError(
+ f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+ f" {attn_weights.size()}"
+ )
+
+ if attention_mask is not None:
+ if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+ raise ValueError(
+ f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+ )
+ attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if layer_head_mask is not None:
+ if layer_head_mask.size() != (self.num_heads,):
+ raise ValueError(
+ f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+ if output_attentions:
+ # this operation is a bit awkward, but it's required to
+ # make sure that attn_weights keeps its gradient.
+ # In order to do so, attn_weights have to be reshaped
+ # twice and have to be reused in the following
+ attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+ attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+ else:
+ attn_weights_reshaped = None
+
+ attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+ attn_output = torch.bmm(attn_probs, value_states)
+
+ if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+ raise ValueError(
+ f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+ f" {attn_output.size()}"
+ )
+
+ attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+ attn_output = attn_output.transpose(1, 2)
+
+ # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+ # partitioned across GPUs when using tensor-parallelism.
+ attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+ attn_output = self.out_proj(attn_output)
+
+ return attn_output, attn_weights_reshaped, past_key_values
+
+
+class Speech2Text2DecoderLayer(GradientCheckpointingLayer):
+ def __init__(self, config: Speech2Text2Config):
+ super().__init__()
+ self.embed_dim = config.d_model
+
+ self.self_attn = Speech2Text2Attention(
+ embed_dim=self.embed_dim,
+ num_heads=config.decoder_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=True,
+ )
+ self.dropout = config.dropout
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.activation_dropout = config.activation_dropout
+
+ self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ if config.is_decoder:
+ self.encoder_attn = Speech2Text2Attention(
+ self.embed_dim,
+ config.decoder_attention_heads,
+ dropout=config.attention_dropout,
+ is_decoder=True,
+ )
+ self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+ self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+ self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ layer_head_mask: Optional[torch.Tensor] = None,
+ cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: Optional[bool] = False,
+ use_cache: Optional[bool] = True,
+ ):
+ """
+ Args:
+ hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+ attention_mask (`torch.FloatTensor`): attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ encoder_hidden_states (`torch.FloatTensor`):
+ cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+ encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+ `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+ layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+ `(encoder_attention_heads,)`.
+ cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+ size *(decoder_attention_heads,)*.
+ past_key_values (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ """
+ residual = hidden_states
+
+ # Self Attention
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+ # add present self-attn cache to positions 1,2 of present_key_value tuple
+ hidden_states, self_attn_weights, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ past_key_values=self_attn_past_key_value,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.self_attn_layer_norm(hidden_states)
+
+ # Cross-Attention Block
+ cross_attn_present_key_value = None
+ cross_attn_weights = None
+ if encoder_hidden_states is not None:
+ residual = hidden_states
+
+ # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+ cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+ hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+ hidden_states=hidden_states,
+ key_value_states=encoder_hidden_states,
+ attention_mask=encoder_attention_mask,
+ layer_head_mask=cross_attn_layer_head_mask,
+ past_key_values=cross_attn_past_key_value,
+ output_attentions=output_attentions,
+ )
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+ # add cross-attn to positions 3,4 of present_key_value tuple
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ # Fully Connected
+ residual = hidden_states
+ hidden_states = self.activation_fn(self.fc1(hidden_states))
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.fc2(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ hidden_states = residual + hidden_states
+ hidden_states = self.final_layer_norm(hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights, cross_attn_weights)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+class Speech2Text2PreTrainedModel(PreTrainedModel):
+ config: Speech2Text2Config
+ base_model_prefix = "model"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ std = self.config.init_std
+ if isinstance(module, (nn.Linear, nn.Conv1d)):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+ elif isinstance(module, Speech2Text2SinusoidalPositionalEmbedding):
+ weight = module.get_embedding(*module.weights.shape, module.padding_idx)
+ weight = nn.Parameter(weight, requires_grad=False)
+ weight.detach_()
+ module.weights = weight
+
+
+SPEECH_TO_TEXT_2_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`Speech2Text2Config`]):
+ Model configuration class with all the parameters of the model. Initializing with a config file does not
+ load the weights associated with the model, only the configuration. Check out the
+ [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
+ """
+ Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Speech2Text2DecoderLayer`]
+
+ Args:
+ config: Speech2Text2Config
+ """
+
+ def __init__(self, config: Speech2Text2Config):
+ super().__init__(config)
+ self.dropout = config.dropout
+ self.layerdrop = config.decoder_layerdrop
+ self.padding_idx = config.pad_token_id
+ self.max_target_positions = config.max_target_positions
+ self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+ self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+
+ self.embed_positions = Speech2Text2SinusoidalPositionalEmbedding(
+ self.max_target_positions,
+ config.d_model,
+ self.padding_idx,
+ )
+
+ self.layers = nn.ModuleList([Speech2Text2DecoderLayer(config) for _ in range(config.decoder_layers)])
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def forward(
+ self,
+ input_ids=None,
+ attention_mask=None,
+ encoder_hidden_states=None,
+ encoder_attention_mask=None,
+ head_mask=None,
+ cross_attn_head_mask=None,
+ past_key_values=None,
+ inputs_embeds=None,
+ use_cache=None,
+ output_attentions=None,
+ output_hidden_states=None,
+ return_dict=None,
+ ):
+ r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+ provide it.
+
+ Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ of the decoder.
+ encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+ Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+ selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules in the decoder to avoid performing
+ cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+ shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+ than the model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
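+
+        Example (a minimal, illustrative sketch with a randomly initialized decoder; the import path below assumes
+        the deprecated module layout of this file):
+
+        ```python
+        >>> import torch
+        >>> from transformers import Speech2Text2Config
+        >>> from transformers.models.deprecated.speech_to_text_2.modeling_speech_to_text_2 import Speech2Text2Decoder
+
+        >>> decoder = Speech2Text2Decoder(Speech2Text2Config())
+        >>> input_ids = torch.tensor([[2, 10, 11, 12]])
+        >>> hidden = decoder(input_ids=input_ids).last_hidden_state  # (batch_size, sequence_length, config.d_model)
+        ```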
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # retrieve input_ids and inputs_embeds
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_shape = input_ids.size()
+ input_ids = input_ids.view(-1, input_shape[-1])
+ elif inputs_embeds is not None:
+ input_shape = inputs_embeds.size()[:-1]
+ else:
+ raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+ # past_key_values_length
+ past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+ if inputs_embeds is None:
+ inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+ attention_mask = _prepare_4d_causal_attention_mask(
+ attention_mask, input_shape, inputs_embeds, past_key_values_length
+ )
+
+ # expand encoder attention mask
+ if encoder_hidden_states is not None and encoder_attention_mask is not None:
+ # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+ encoder_attention_mask = _prepare_4d_attention_mask(
+ encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+ )
+
+ # embed positions
+ positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
+
+ hidden_states = inputs_embeds + positions
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
+ )
+ use_cache = False
+
+ # decoder layers
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attns = () if output_attentions else None
+ all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+ next_decoder_cache = () if use_cache else None
+
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+ if attn_mask is not None:
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {attn_mask.size()[0]}."
+                    )
+ for idx, decoder_layer in enumerate(self.layers):
+ # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+ if self.training:
+ dropout_probability = torch.rand([])
+ if dropout_probability < self.layerdrop:
+ continue
+
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+ cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+ past_key_values=past_key_values[idx] if past_key_values is not None else None,
+ output_attentions=output_attentions,
+ use_cache=use_cache,
+ )
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
+
+ if output_attentions:
+ all_self_attns += (layer_outputs[1],)
+
+ if encoder_hidden_states is not None:
+ all_cross_attentions += (layer_outputs[2],)
+
+ # add hidden states from the last decoder layer
+ if output_hidden_states:
+ all_hidden_states += (hidden_states,)
+
+ next_cache = next_decoder_cache if use_cache else None
+ if not return_dict:
+ return tuple(
+ v
+ for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+ if v is not None
+ )
+ return BaseModelOutputWithPastAndCrossAttentions(
+ last_hidden_state=hidden_states,
+ past_key_values=next_cache,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attns,
+ cross_attentions=all_cross_attentions,
+ )
+
+
+@add_start_docstrings(
+    "The Speech2Text2 decoder wrapper, a helper used to correctly load pretrained checkpoints for the"
+    " encoder-decoder framework.",
+    SPEECH_TO_TEXT_2_START_DOCSTRING,
+)
+class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel):
+ """
+ This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
+ used in combination with the [`EncoderDecoderModel`] framework.
+ """
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.decoder = Speech2Text2Decoder(config)
+
+ def forward(self, *args, **kwargs):
+ return self.decoder(*args, **kwargs)
+
+
+@add_start_docstrings(
+    "The Speech2Text2 Decoder with a language modeling head. Can be used as the decoder part of"
+    " [`EncoderDecoderModel`] and [`SpeechEncoderDecoderModel`].",
+    SPEECH_TO_TEXT_2_START_DOCSTRING,
+)
+)
+class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
+ _tied_weights_keys = ["lm_head.weight"]
+
+ def __init__(self, config):
+ config.is_decoder = True
+ config.is_encoder_decoder = False
+ super().__init__(config)
+ self.model = Speech2Text2DecoderWrapper(config)
+
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.model.decoder.embed_tokens
+
+ def set_input_embeddings(self, value):
+ self.model.decoder.embed_tokens = value
+
+ def set_decoder(self, decoder):
+ self.model.decoder = decoder
+
+ def get_decoder(self):
+ return self.model.decoder
+
+ @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.FloatTensor] = None,
+ encoder_attention_mask: Optional[torch.FloatTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.FloatTensor], CausalLMOutputWithCrossAttentions]:
+ r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+ provide it.
+
+ Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+ if the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+ head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)` and 2 additional tensors of
+ shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
+ tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
+
+ Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+ cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+ that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+ all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the language modeling loss. Indices should either be in `[0, ...,
+ config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+ (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+ returned tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+ for more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import (
+ ... SpeechEncoderDecoderModel,
+ ... Speech2Text2ForCausalLM,
+ ... Wav2Vec2Model,
+ ... Speech2Text2Config,
+ ... Wav2Vec2Config,
+ ... Wav2Vec2FeatureExtractor,
+ ... Speech2Text2Tokenizer,
+ ... )
+ >>> from datasets import load_dataset
+
+ >>> feature_extractor = Wav2Vec2FeatureExtractor()
+ >>> tokenizer = Speech2Text2Tokenizer.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+
+ >>> encoder = Wav2Vec2Model(Wav2Vec2Config())
+ >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config())
+ >>> # init random speech2text model
+
+ >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
+ >>> model.config.pad_token_id = tokenizer.pad_token_id
+ >>> model.config.decoder_start_token_id = tokenizer.bos_token_id
+ >>> # pre-process inputs and labels
+
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+ >>> inputs = feature_extractor(
+ ... ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
+ ... )
+ >>> input_values = inputs.input_values
+ >>> decoder_input_ids = tokenizer(ds[0]["text"], return_tensors="pt").input_ids
+ >>> # compute loss
+
+ >>> loss = model(inputs=input_values, labels=decoder_input_ids).loss
+ >>> # backprop loss
+
+ >>> loss.backward() # doctest: +IGNORE_RESULT
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+ outputs = self.model.decoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ head_mask=head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ logits = self.lm_head(outputs[0])
+
+ loss = None
+ if labels is not None:
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return (loss,) + output if loss is not None else output
+
+ return CausalLMOutputWithCrossAttentions(
+ loss=loss,
+ logits=logits,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
+ def prepare_inputs_for_generation(
+ self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
+ ):
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_ids.shape)
+
+ if past_key_values:
+ past_length = past_key_values.get_seq_length()
+
+ # Some generation methods already pass only the last input ID
+ if input_ids.shape[1] > past_length:
+ remove_prefix_length = past_length
+ else:
+ # Default to old behavior: keep only final ID
+ remove_prefix_length = input_ids.shape[1] - 1
+
+ input_ids = input_ids[:, remove_prefix_length:]
+ # first step, decoder_cached_states are empty
+        return {
+            "input_ids": input_ids,
+ "attention_mask": attention_mask,
+ "past_key_values": past_key_values,
+ "use_cache": use_cache,
+ }
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
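+        # Reorder each layer's cached key/value tensors along the batch dimension so that
+        # they track the beam indices selected at the current beam-search step.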
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+__all__ = ["Speech2Text2ForCausalLM", "Speech2Text2PreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b69420bf540cbe737d6509b79d943af9d41e6c24
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for Speech2Text2
+"""
+
+import warnings
+from contextlib import contextmanager
+
+from ....processing_utils import ProcessorMixin
+
+
+class Speech2Text2Processor(ProcessorMixin):
+ r"""
+ Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
+ a single processor.
+
+ [`Speech2Text2Processor`] offers all the functionalities of [`AutoFeatureExtractor`] and [`Speech2Text2Tokenizer`].
+ See the [`~Speech2Text2Processor.__call__`] and [`~Speech2Text2Processor.decode`] for more information.
+
+ Args:
+ feature_extractor (`AutoFeatureExtractor`):
+ An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
+ tokenizer (`Speech2Text2Tokenizer`):
+ An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
+ """
+
+ feature_extractor_class = "AutoFeatureExtractor"
+ tokenizer_class = "Speech2Text2Tokenizer"
+
+ def __init__(self, feature_extractor, tokenizer):
+ super().__init__(feature_extractor, tokenizer)
+ self.current_processor = self.feature_extractor
+ self._in_target_context_manager = False
+
+ def __call__(self, *args, **kwargs):
+ """
+ When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
+ [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
+ [`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
+ Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two
+ methods for more information.
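+
+        Example (a minimal sketch; assumes the "facebook/s2t-wav2vec2-large-en-de" checkpoint and a 16 kHz
+        waveform; a zero array stands in for real audio):
+
+        ```python
+        >>> import numpy as np
+        >>> from transformers import Speech2Text2Processor
+
+        >>> processor = Speech2Text2Processor.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+        >>> audio = np.zeros(16_000, dtype=np.float32)  # one second of "silence"
+        >>> inputs = processor(audio=audio, sampling_rate=16_000, return_tensors="pt")
+        >>> labels = processor(text="hello world", return_tensors="pt").input_ids
+        ```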
+ """
+ # For backward compatibility
+ if self._in_target_context_manager:
+ return self.current_processor(*args, **kwargs)
+
+ if "raw_speech" in kwargs:
+ warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+ audio = kwargs.pop("raw_speech")
+ else:
+ audio = kwargs.pop("audio", None)
+ sampling_rate = kwargs.pop("sampling_rate", None)
+ text = kwargs.pop("text", None)
+ if len(args) > 0:
+ audio = args[0]
+ args = args[1:]
+
+ if audio is None and text is None:
+ raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+ if audio is not None:
+ inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
+ if text is not None:
+ encodings = self.tokenizer(text, **kwargs)
+
+ if text is None:
+ return inputs
+ elif audio is None:
+ return encodings
+ else:
+ inputs["labels"] = encodings["input_ids"]
+ return inputs
+
+ @contextmanager
+ def as_target_processor(self):
+ """
+ Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+ Speech2Text2.
+ """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call)."
+        )
+ self._in_target_context_manager = True
+ self.current_processor = self.tokenizer
+ yield
+ self.current_processor = self.feature_extractor
+ self._in_target_context_manager = False
+
+
+__all__ = ["Speech2Text2Processor"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..05c10080b7732601efd476a8c6e2640e74450a76
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py
@@ -0,0 +1,252 @@
+# coding=utf-8
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Speech2Text2."""
+
+import json
+import os
+from typing import Optional
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {
+ "vocab_file": "vocab.json",
+ "tokenizer_config_file": "tokenizer_config.json",
+ "merges_file": "merges.txt",
+}
+
+
+BPE_TOKEN_MERGES = "</w>"
+BPE_TOKEN_VOCAB = "@@ "
+
+
+def get_pairs(word):
+ """
+ Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
+ strings)
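+
+    Example (illustrative):
+
+    ```python
+    >>> sorted(get_pairs(("h", "e", "l", "l", "o")))
+    [('e', 'l'), ('h', 'e'), ('l', 'l'), ('l', 'o')]
+    ```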
+ """
+ pairs = set()
+ prev_char = word[0]
+ for char in word[1:]:
+ pairs.add((prev_char, char))
+ prev_char = char
+ return pairs
+
+
+# Speech2Text2 has no max input length
+
+
+class Speech2Text2Tokenizer(PreTrainedTokenizer):
+ """
+ Constructs a Speech2Text2Tokenizer.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
+ the superclass for more information regarding such methods.
+
+ Args:
+ vocab_file (`str`):
+ File containing the vocabulary.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sentence token.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+
+        **kwargs
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`].
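+
+    Example (a minimal sketch; assumes the "facebook/s2t-wav2vec2-large-en-de" checkpoint, whose vocabulary ships
+    with a `merges.txt` so that both encoding and decoding work):
+
+    ```python
+    >>> from transformers import Speech2Text2Tokenizer
+
+    >>> tokenizer = Speech2Text2Tokenizer.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+    >>> ids = tokenizer("hello world").input_ids
+    >>> text = tokenizer.decode(ids, skip_special_tokens=True)
+    ```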
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+        bos_token="<s>",
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+ do_lower_case=False,
+ merges_file=None,
+ **kwargs,
+ ):
+ self.do_lower_case = do_lower_case
+
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
+ self.encoder = json.load(vocab_handle)
+ self.decoder = {v: k for k, v in self.encoder.items()}
+
+ if merges_file is None:
+            logger.info(f"No merges file provided. {self.__class__.__name__} can only be used for decoding.")
+
+ self.bpe_ranks = None
+ self.cache = None
+ else:
+ with open(merges_file, encoding="utf-8") as merges_handle:
+ merges = merges_handle.read().split("\n")[:-1]
+
+ merges = [tuple(merge.split()[:2]) for merge in merges]
+ self.bpe_ranks = dict(zip(merges, range(len(merges))))
+ self.cache = {}
+ super().__init__(
+ unk_token=unk_token,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ pad_token=pad_token,
+ do_lower_case=do_lower_case,
+ **kwargs,
+ )
+
+ @property
+ def vocab_size(self) -> int:
+ return len(self.decoder)
+
+ def get_vocab(self) -> dict:
+ return dict(self.encoder, **self.added_tokens_encoder)
+
+ def bpe(self, token):
+ word = tuple(token[:-1]) + (token[-1] + BPE_TOKEN_MERGES,)
+ if token in self.cache:
+ return self.cache[token]
+ pairs = get_pairs(word)
+
+ if not pairs:
+ return token
+
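+        # Repeatedly apply the highest-priority merge (lowest BPE rank) present in the
+        # word until no pair in the word appears in the merge table.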
+ while True:
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+ if bigram not in self.bpe_ranks:
+ break
+ first, second = bigram
+ new_word = []
+ i = 0
+ while i < len(word):
+ try:
+ j = word.index(first, i)
+ except ValueError:
+ new_word.extend(word[i:])
+ break
+ else:
+ new_word.extend(word[i:j])
+ i = j
+
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+ new_word.append(first + second)
+ i += 2
+ else:
+ new_word.append(word[i])
+ i += 1
+ new_word = tuple(new_word)
+ word = new_word
+ if len(word) == 1:
+ break
+ else:
+ pairs = get_pairs(word)
+ word = " ".join(word)
+ if word == "\n " + BPE_TOKEN_MERGES:
+ word = "\n" + BPE_TOKEN_MERGES
+
+ if word.endswith(BPE_TOKEN_MERGES):
+ word = word.replace(BPE_TOKEN_MERGES, "")
+
+ word = word.replace(" ", BPE_TOKEN_VOCAB)
+ self.cache[token] = word
+ return word
+
+ def _tokenize(self, text):
+ """Tokenize a string."""
+
+ if self.bpe_ranks is None:
+            raise ValueError(
+                "This tokenizer was instantiated without a `merges.txt` file, so it can only be used for "
+                "decoding, not for encoding. Make sure to provide a `merges.txt` file at instantiation to "
+                "enable encoding."
+            )
+
+ if self.do_lower_case:
+ text = text.lower()
+
+ text = text.split()
+
+ split_tokens = []
+ for token in text:
+ if token:
+ split_tokens.extend(list(self.bpe(token).split(" ")))
+
+ return split_tokens
+
+ def _convert_token_to_id(self, token: str) -> int:
+ """Converts a token (str) in an index (integer) using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ def _convert_id_to_token(self, index: int) -> str:
+ """Converts an index (integer) in a token (str) using the vocab."""
+ result = self.decoder.get(index, self.unk_token)
+ return result
+
+ def convert_tokens_to_string(self, tokens: list[str]) -> str:
+ """
+ Converts a list of output tokens into a single string.
+ """
+ # combine tokens
+ string = " ".join(tokens)
+
+ # make sure @@ tokens are concatenated
+ string = "".join(string.split(BPE_TOKEN_VOCAB))
+
+ return string
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ merges_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+ )
+
+ with open(vocab_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+ index = 0
+ if self.bpe_ranks is None:
+ return (vocab_file,)
+
+ with open(merges_file, "w", encoding="utf-8") as writer:
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
+ " Please check that the tokenizer is not corrupted!"
+ )
+ index = token_index
+ writer.write(" ".join(bpe_tokens) + "\n")
+ index += 1
+
+ return (vocab_file, merges_file)
+
+
+__all__ = ["Speech2Text2Tokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b535eb1df2c40a7680bbf8d57fc70b78e23437f4
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .tokenization_tapex import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79fed43d69154c53235041da119468437109a2fd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a2c4b4ab0c4006214ed2d92e9b91a4740ae05e6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/__pycache__/tokenization_tapex.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32383ddd497828b6a429d39c5d4069a21154b9f
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py
@@ -0,0 +1,1470 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for TAPEX."""
+
+import json
+import os
+import random
+from functools import lru_cache
+from typing import Optional, Union
+
+import regex as re
+
+from ....file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
+from ....tokenization_utils import AddedToken, PreTrainedTokenizer
+from ....tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy
+from ....utils import logging
+
+
+if is_pandas_available():
+ import pandas as pd
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+
+
+class TapexTruncationStrategy(ExplicitEnum):
+ """
+    Possible values for the `truncation` argument in [`~TapexTokenizer.__call__`]. Useful for tab-completion in an IDE.
+ """
+
+ DROP_ROWS_TO_FIT = "drop_rows_to_fit"
+
+
+TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+ add_special_tokens (`bool`, *optional*, defaults to `True`):
+ Whether or not to encode the sequences with the special tokens relative to their model.
+ padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+ Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence is provided).
+ - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+ acceptable input length for the model if that argument is not provided.
+ - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+ lengths).
+            truncation (`bool`, `str`, [`TapexTruncationStrategy`] or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+ - `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. This will truncate
+ row by row, removing rows from the table.
+ - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+ to the maximum acceptable input length for the model if that argument is not provided. This will
+ truncate token by token, removing a token from the longest sequence in the pair if a pair of
+ sequences (or a batch of pairs) is provided.
+ - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+ - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+ maximum acceptable input length for the model if that argument is not provided. This will only
+ truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+ - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+ greater than the model maximum admissible input size).
+ max_length (`int`, *optional*):
+ Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
+ `None`, this will use the predefined model maximum length if a maximum length is required by one of the
+ truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
+ truncation/padding to a maximum length will be deactivated.
+ stride (`int`, *optional*, defaults to 0):
+ If set to a number along with `max_length`, the overflowing tokens returned when
+ `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+ returned to provide some overlap between truncated and overflowing sequences. The value of this
+ argument defines the number of overlapping tokens.
+ pad_to_multiple_of (`int`, *optional*):
+ If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+ the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+ return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+ If set, will return tensors instead of list of python integers. Acceptable values are:
+
+ - `'tf'`: Return TensorFlow `tf.constant` objects.
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
+
+@lru_cache
+def bytes_to_unicode():
+ """
+    Returns a list of utf-8 bytes and a mapping to unicode strings. We specifically avoid mapping to whitespace/control
+ characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large #
+ of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset
+ you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe
+ vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
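+
+    Example (illustrative):
+
+    ```python
+    >>> mapping = bytes_to_unicode()
+    >>> mapping[ord("a")]  # printable bytes map to themselves
+    'a'
+    >>> mapping[0]  # non-printable bytes are shifted to unused code points
+    'Ā'
+    ```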
+ """
+ bs = (
+ list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+ )
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8 + n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+ """
+ Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
+ strings).
+ """
+ pairs = set()
+ prev_char = word[0]
+ for char in word[1:]:
+ pairs.add((prev_char, char))
+ prev_char = char
+ return pairs
+
+
+class IndexedRowTableLinearize:
+ """
+ FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
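+
+    Example (illustrative):
+
+    ```python
+    >>> linearizer = IndexedRowTableLinearize()
+    >>> linearizer.process_table({"header": ["name", "age"], "rows": [["alice", 30]]})
+    'col : name | age row 1 : alice | 30'
+    ```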
+ """
+
+ def process_table(self, table_content: dict):
+ """
+        Given a table, TableLinearize aims at converting it into a flattened sequence with special symbols.
+ """
+        assert "header" in table_content and "rows" in table_content, (
+            'The table content must be a dict with "header" and "rows" keys, e.g.'
+            ' {"header": ["col1", "col2"], "rows": [["a", "b"], ["c", "d"]]}'
+        )
+ # process header
+ table_str = self.process_header(table_content["header"]) + " "
+ # process rows
+ for i, row_example in enumerate(table_content["rows"]):
+ # NOTE: the row should start from row 1 instead of 0
+ table_str += self.process_row(row_example, row_index=i + 1) + " "
+ return table_str.strip()
+
+ def process_header(self, headers: list):
+ """
+        Given a list of headers, TableLinearize aims at converting it into a flattened sequence with special symbols.
+ """
+ return "col : " + " | ".join(headers)
+
+ def process_row(self, row: list, row_index: int):
+ """
+        Given a row, TableLinearize aims at converting it into a flattened sequence with special symbols.
+ """
+ row_str = ""
+ row_cell_values = []
+ for cell_value in row:
+ if isinstance(cell_value, int):
+ row_cell_values.append(str(cell_value))
+ else:
+ row_cell_values.append(cell_value)
+ row_str += " | ".join(row_cell_values)
+ return "row " + str(row_index) + " : " + row_str
+
+
+class TapexTokenizer(PreTrainedTokenizer):
+ r"""
+ Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
+
+ This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences
+ to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following:
+
+ sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
+
+ The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table
+ will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated
+ for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to
+ the tokenizer for instance to prepare them for the model.
+
+ Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2.
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ merges_file (`str`):
+ Path to the merges file.
+ do_lower_case (`bool`, *optional*, defaults to `True`):
+ Whether or not to lowercase the input when tokenizing.
+ errors (`str`, *optional*, defaults to `"replace"`):
+ Paradigm to follow when decoding bytes to UTF-8. See
+ [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
+            token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This makes it possible to treat the leading word just
+            like any other word. (The BART tokenizer detects the beginning of a word by the preceding space.)
+ max_cell_length (`int`, *optional*, defaults to 15):
+ Maximum number of characters per cell when linearizing a table. If this number is exceeded, truncation
+ takes place.
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ merges_file,
+ do_lower_case=True,
+ errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+ add_prefix_space=False,
+ max_cell_length=15,
+ **kwargs,
+ ):
+ bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+ eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+ sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+ cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+ unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+ pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+ # Mask token behave like a normal word, i.e. include the space before it
+ mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+ with open(vocab_file, encoding="utf-8") as vocab_handle:
+ self.encoder = json.load(vocab_handle)
+ self.decoder = {v: k for k, v in self.encoder.items()}
+ self.errors = errors # how to handle errors in decoding
+ self.byte_encoder = bytes_to_unicode()
+ self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+ with open(merges_file, encoding="utf-8") as merges_handle:
+ bpe_merges = merges_handle.read().split("\n")[1:-1]
+ bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+ self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+ self.cache = {}
+ self.add_prefix_space = add_prefix_space
+ self.do_lower_case = do_lower_case
+
+ # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+ self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+ # additional properties
+
+ super().__init__(
+ vocab_file=vocab_file,
+ merges_file=merges_file,
+ do_lower_case=do_lower_case,
+ errors=errors,
+ bos_token=bos_token,
+ eos_token=eos_token,
+ unk_token=unk_token,
+ sep_token=sep_token,
+ cls_token=cls_token,
+ pad_token=pad_token,
+ mask_token=mask_token,
+ add_prefix_space=add_prefix_space,
+ max_cell_length=max_cell_length,
+ **kwargs,
+ )
+
+ self.max_cell_length = max_cell_length
+ self.table_linearize = IndexedRowTableLinearize()
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. A TAPEX sequence has the following format:
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of IDs to which the special tokens will be added.
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ Returns:
+ `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
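+
+        Example (illustrative; the inner IDs are arbitrary, and the special-token IDs assume the BART-style
+        vocabulary used by TAPEX checkpoints, where `<s>` is 0 and `</s>` is 2):
+
+        ```python
+        >>> from transformers import TapexTokenizer
+
+        >>> tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+        >>> tokenizer.build_inputs_with_special_tokens([10, 11])
+        [0, 10, 11, 2]
+        ```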
+ """
+ if token_ids_1 is None:
+ return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+ cls = [self.cls_token_id]
+ sep = [self.sep_token_id]
+ return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+ def get_special_tokens_mask(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+ ) -> list[int]:
+ """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+ token_ids_0 (`list[int]`):
+ List of IDs.
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+ Returns:
+ `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is None:
+ return [1] + ([0] * len(token_ids_0)) + [1]
+ return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. TAPEX does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+ token_ids_0 (`list[int]`):
+ List of IDs.
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ Returns:
+ `list[int]`: List of zeros.
+ """
+ sep = [self.sep_token_id]
+ cls = [self.cls_token_id]
+
+ if token_ids_1 is None:
+ return len(cls + token_ids_0 + sep) * [0]
+ return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+ def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+ add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+ if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+ text = " " + text
+ return (text, kwargs)
+
+ @property
+ def vocab_size(self):
+ return len(self.encoder)
+
+ def get_vocab(self):
+ return dict(self.encoder, **self.added_tokens_encoder)
+
+ def bpe(self, token):
+ if token in self.cache:
+ return self.cache[token]
+ word = tuple(token)
+ pairs = get_pairs(word)
+
+ if not pairs:
+ return token
+
+ while True:
+ bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+ if bigram not in self.bpe_ranks:
+ break
+ first, second = bigram
+ new_word = []
+ i = 0
+ while i < len(word):
+ try:
+ j = word.index(first, i)
+ except ValueError:
+ new_word.extend(word[i:])
+ break
+ else:
+ new_word.extend(word[i:j])
+ i = j
+
+ if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+ new_word.append(first + second)
+ i += 2
+ else:
+ new_word.append(word[i])
+ i += 1
+ new_word = tuple(new_word)
+ word = new_word
+ if len(word) == 1:
+ break
+ else:
+ pairs = get_pairs(word)
+ word = " ".join(word)
+ self.cache[token] = word
+ return word
+
+ def _tokenize(self, text):
+ """Tokenize a string."""
+ bpe_tokens = []
+ for token in re.findall(self.pat, text):
+ token = "".join(
+ self.byte_encoder[b] for b in token.encode("utf-8")
+ ) # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+ bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+ return bpe_tokens
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ return self.decoder.get(index)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (string) in a single string."""
+ text = "".join(tokens)
+ text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+ return text
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+ merge_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+ )
+
+ with open(vocab_file, "w", encoding="utf-8") as f:
+ f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+ index = 0
+ with open(merge_file, "w", encoding="utf-8") as writer:
+ writer.write("#version: 0.2\n")
+ for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+ if index != token_index:
+ logger.warning(
+ f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+ " Please check that the tokenizer is not corrupted!"
+ )
+ index = token_index
+ writer.write(" ".join(bpe_tokens) + "\n")
+ index += 1
+
+ return vocab_file, merge_file
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def __call__(
+ self,
+ table: Union["pd.DataFrame", list["pd.DataFrame"]] = None,
+ query: Optional[Union[TextInput, list[TextInput]]] = None,
+ answer: Optional[Union[str, list[str]]] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = None,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ """
+ Main method to tokenize and prepare for the model one or several table-sequence pair(s).
+
+ Args:
+ table (`pd.DataFrame`, `list[pd.DataFrame]`):
+ Table(s) containing tabular data.
+ query (`str` or `list[str]`, *optional*):
+ Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of
+ sentences must match the number of tables.
+ answer (`str` or `list[str]`, *optional*):
+ Optionally, the corresponding answer to the questions as supervision.
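+
+        Example (a minimal sketch; assumes `pandas` is installed and the "microsoft/tapex-base" checkpoint):
+
+        ```python
+        >>> import pandas as pd
+        >>> from transformers import TapexTokenizer
+
+        >>> tokenizer = TapexTokenizer.from_pretrained("microsoft/tapex-base")
+        >>> table = pd.DataFrame.from_dict({"year": [1896, 2012], "city": ["athens", "london"]})
+        >>> encoding = tokenizer(table=table, query="in which year did london host the olympics?", return_tensors="pt")
+        ```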
+ """
+
+ if table is not None:
+ return self.source_call_func(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+ elif answer is not None:
+ return self.target_call_func(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+ else:
+ raise ValueError("You need to provide either a `table` or an `answer`.")
+
+ def source_call_func(
+ self,
+ table: Union["pd.DataFrame", list["pd.DataFrame"]],
+ query: Optional[Union[TextInput, list[TextInput]]] = None,
+ answer: Optional[Union[str, list[str]]] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Union[bool, str, TruncationStrategy] = None,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ # Input type checking for clearer error
+ valid_table = False
+ valid_query = False
+
+ # Check that table have a valid type
+ if isinstance(table, pd.DataFrame):
+ valid_table = True
+ elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame):
+ valid_table = True
+
+ # Check that query have a valid type
+ if query is None or isinstance(query, str):
+ valid_query = True
+ elif isinstance(query, (list, tuple)):
+ if len(query) == 0 or isinstance(query[0], str):
+ valid_query = True
+
+ if not valid_table:
+            raise ValueError(
+                "table input must be of type `pd.DataFrame` (single example) or `list[pd.DataFrame]` (batch of examples)."
+            )
+        if not valid_query:
+            raise ValueError("query input must be of type `str` (single example) or `list[str]` (batch of examples).")
+ is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple))
+
+ if is_batched:
+ return self.batch_encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+ else:
+ return self.encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def batch_encode_plus(
+ self,
+ table: Union["pd.DataFrame", list["pd.DataFrame"]],
+ query: Optional[list[TextInput]] = None,
+ answer: Optional[list[str]] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Optional[Union[bool, str]] = None,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ """
+
+
+ This method is deprecated, `__call__` should be used instead.
+
+
+ """
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._batch_encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _batch_encode_plus(
+ self,
+ table: Union["pd.DataFrame", list["pd.DataFrame"]],
+ query: Optional[list[TextInput]] = None,
+ answer: Optional[list[str]] = None,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ if return_offsets_mapping:
+ raise NotImplementedError(
+ "return_offset_mapping is not available when using Python tokenizers. "
+ "To use this feature, change your tokenizer to one deriving from "
+ "transformers.PreTrainedTokenizerFast."
+ )
+
+ if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)):
+ # single table, many queries case
+ # duplicate table for every query
+ table = [table] * len(query)
+ if isinstance(table, (list, tuple)) and isinstance(query, str):
+ # many tables, single query case
+ # duplicate query for every table
+ query = [query] * len(table)
+
+ batch_outputs = self._batch_prepare_for_model(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_attention_mask=return_attention_mask,
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ return_tensors=return_tensors,
+ verbose=verbose,
+ )
+
+ return BatchEncoding(batch_outputs)
+
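+ # A hedged sketch of the broadcasting above: one table may be paired with many
+ # queries (the table is duplicated), or many tables with one query (the query
+ # is duplicated), so `_batch_prepare_for_model` always sees aligned lists:
+ #
+ #   enc = tokenizer(table=table, query=["how many rows?", "which cities?"],
+ #                   padding=True, return_tensors="pt")
+ #   enc["input_ids"].shape  # (2, padded_length)
+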
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def _batch_prepare_for_model(
+ self,
+ table: Union["pd.DataFrame", list["pd.DataFrame"]],
+ query: Optional[Union[TextInput, list[TextInput]]] = None,
+ answer: Optional[Union[str, list[str]]] = None,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[str] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ ) -> BatchEncoding:
+ """
+ This method adds special tokens, truncates sequences if overflowing while taking into account the special
+ tokens and manages a moving window (with user defined stride) for overflowing tokens.
+ """
+ batch_outputs = {}
+ if answer is None:
+ answer = [None] * len(table)
+ if query is None:
+ # guard against a batched call without queries, so the zip below stays aligned
+ query = [None] * len(table)
+ for _table, _query, _answer in zip(table, query, answer):
+ text = self.prepare_table_query(
+ _table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length
+ )
+
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+ outputs = self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=None, # we pad in batch afterwards
+ return_attention_mask=False, # we pad in batch afterwards
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ return_tensors=None, # We convert the whole batch to tensors at the end
+ prepend_batch_axis=False,
+ verbose=verbose,
+ )
+
+ for key, value in outputs.items():
+ if key not in batch_outputs:
+ batch_outputs[key] = []
+ batch_outputs[key].append(value)
+
+ batch_outputs = self.pad(
+ batch_outputs,
+ padding=padding_strategy.value,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_attention_mask=return_attention_mask,
+ )
+
+ batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+ return batch_outputs
+
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
+ def encode(
+ self,
+ table: "pd.DataFrame",
+ query: Optional[TextInput] = None,
+ answer: Optional[str] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Optional[Union[bool, str, TruncationStrategy, TapexTruncationStrategy]] = None,
+ max_length: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ **kwargs,
+ ) -> list[int]:
+ """
+ Prepare a table, a string and a possible answer for the model. This method does not return token type IDs,
+ attention masks, etc., which are necessary for the model to work correctly. Use this method if you want to
+ build your processing on your own, otherwise refer to `__call__`.
+ """
+ encoded_inputs = self.encode_plus(
+ table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ return_tensors=return_tensors,
+ **kwargs,
+ )
+
+ return encoded_inputs["input_ids"]
+
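+ # Sketch (same assumptions as above): `encode` is a thin wrapper that keeps only
+ # the token ids, which is convenient for quick inspection or manual batching:
+ #
+ #   ids = tokenizer.encode(table, query="how many rows?")
+ #   tokenizer.decode(ids)  # round-trips to special tokens + query + linearized table
+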
+ @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+ def encode_plus(
+ self,
+ table: "pd.DataFrame",
+ query: Optional[TextInput] = None,
+ answer: Optional[str] = None,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Optional[Union[bool, str]] = None,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._encode_plus(
+ table=table,
+ query=query,
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _encode_plus(
+ self,
+ table: "pd.DataFrame",
+ query: Optional[TextInput] = None,
+ answer: Optional[str] = None,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ if return_offsets_mapping:
+ raise NotImplementedError(
+ "return_offset_mapping is not available when using Python tokenizers. "
+ "To use this feature, change your tokenizer to one deriving from "
+ "transformers.PreTrainedTokenizerFast. "
+ "More information on available tokenizers at "
+ "https://github.com/huggingface/transformers/pull/2674"
+ )
+
+ text = self.prepare_table_query(
+ table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length
+ )
+
+ # if necessary, perform lower case
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+
+ return self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=padding_strategy.value,
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ prepend_batch_axis=True,
+ return_attention_mask=return_attention_mask,
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ verbose=verbose,
+ )
+
+ def target_call_func(
+ self,
+ answer: Union[str, list[str]],
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Optional[Union[bool, str, TruncationStrategy]] = None,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ """
+ The method tokenizes and prepares the answer label for the model.
+
+ Args:
+ answer (`str` or `list[str]`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ is_batched = isinstance(answer, (list, tuple))
+
+ if is_batched:
+ return self.target_batch_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+ else:
+ return self.target_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
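+ # Illustrative sketch (an assumption about typical usage, not fixed API): the
+ # target-side entry points tokenize answer strings so they can serve as decoder
+ # labels when fine-tuning a seq2seq model on table QA data:
+ #
+ #   labels = tokenizer(answer=["london"], return_tensors="pt")["input_ids"]
+ #   # model(**enc, labels=labels) would then compute the generation loss
+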
+ def target_batch_encode_plus(
+ self,
+ answer: list[str],
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Optional[Union[bool, str]] = None,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ """
+ Prepare answer strings for the model.
+
+ Args:
+ answer (`list[str]`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._target_batch_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _target_batch_encode_plus(
+ self,
+ answer: list[str],
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ batch_outputs = {}
+ for text in answer:
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+ outputs = self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=PaddingStrategy.DO_NOT_PAD.value, # we pad in batch afterwards
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=None, # we pad in batch afterwards
+ return_attention_mask=False, # we pad in batch afterwards
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ return_tensors=None, # We convert the whole batch to tensors at the end
+ prepend_batch_axis=False,
+ verbose=verbose,
+ )
+
+ for key, value in outputs.items():
+ if key not in batch_outputs:
+ batch_outputs[key] = []
+ batch_outputs[key].append(value)
+
+ batch_outputs = self.pad(
+ batch_outputs,
+ padding=padding_strategy.value,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_attention_mask=return_attention_mask,
+ )
+
+ return BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+ def target_encode(
+ self,
+ answer: str,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Optional[Union[bool, str, TruncationStrategy, TapexTruncationStrategy]] = None,
+ max_length: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ **kwargs,
+ ) -> list[int]:
+ """
+ Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc.
+ which are necessary for the model to work correctly. Use this method if you want to build your processing on
+ your own, otherwise refer to `__call__`.
+
+ Args:
+ answer (`str`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ encoded_outputs = self.target_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ return_tensors=return_tensors,
+ **kwargs,
+ )
+
+ return encoded_outputs["input_ids"]
+
+ def target_encode_plus(
+ self,
+ answer: str,
+ add_special_tokens: bool = True,
+ padding: Union[bool, str, PaddingStrategy] = False,
+ truncation: Optional[Union[bool, str]] = None,
+ max_length: Optional[int] = None,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ """
+ Prepare an answer string for the model.
+
+ Args:
+ answer (`str`):
+ Corresponding answer supervision to the queries for training the model.
+ """
+ # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+ padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+ padding=padding,
+ truncation=truncation,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ return self._target_encode_plus(
+ answer=answer,
+ add_special_tokens=add_special_tokens,
+ padding_strategy=padding_strategy,
+ truncation_strategy=truncation_strategy,
+ max_length=max_length,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ return_token_type_ids=return_token_type_ids,
+ return_attention_mask=return_attention_mask,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_offsets_mapping=return_offsets_mapping,
+ return_length=return_length,
+ verbose=verbose,
+ **kwargs,
+ )
+
+ def _target_encode_plus(
+ self,
+ answer: str,
+ add_special_tokens: bool = True,
+ padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+ truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length: Optional[int] = None,
+ stride: int = 0,
+ pad_to_multiple_of: Optional[int] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_token_type_ids: Optional[bool] = None,
+ return_attention_mask: Optional[bool] = None,
+ return_overflowing_tokens: bool = False,
+ return_special_tokens_mask: bool = False,
+ return_offsets_mapping: bool = False,
+ return_length: bool = False,
+ verbose: bool = True,
+ **kwargs,
+ ) -> BatchEncoding:
+ if return_offsets_mapping:
+ raise NotImplementedError(
+ "return_offset_mapping is not available when using Python tokenizers. "
+ "To use this feature, change your tokenizer to one deriving from "
+ "transformers.PreTrainedTokenizerFast. "
+ "More information on available tokenizers at "
+ "https://github.com/huggingface/transformers/pull/2674"
+ )
+
+ text = answer
+
+ # if necessary, perform lower case
+ if self.do_lower_case:
+ text = text.lower()
+
+ tokens = self.tokenize(text)
+
+ return self.prepare_for_model(
+ ids=self.convert_tokens_to_ids(tokens),
+ add_special_tokens=add_special_tokens,
+ padding=padding_strategy.value,
+ truncation=truncation_strategy.value,
+ max_length=max_length,
+ stride=stride,
+ pad_to_multiple_of=pad_to_multiple_of,
+ return_tensors=return_tensors,
+ prepend_batch_axis=True,
+ return_attention_mask=return_attention_mask,
+ return_token_type_ids=return_token_type_ids,
+ return_overflowing_tokens=return_overflowing_tokens,
+ return_special_tokens_mask=return_special_tokens_mask,
+ return_length=return_length,
+ verbose=verbose,
+ )
+
+ def prepare_table_query(
+ self,
+ table,
+ query,
+ answer=None,
+ truncation_strategy: Union[str, TruncationStrategy, TapexTruncationStrategy] = TruncationStrategy.DO_NOT_TRUNCATE,
+ max_length=None,
+ ):
+ """
+ This method can be used to linearize a table and add a corresponding query.
+
+ Optionally, it also handles truncation of the table (cells).
+
+ An answer can be provided for more precise truncation.
+ """
+ if not table.empty:
+ # step 1: create table dictionary
+ table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]}
+
+ # step 2: modify table internally
+ # always truncate table cells based on self.max_cell_length
+ # optionally drop rows when truncation_strategy is DROP_ROWS_TO_FIT
+ self.truncate_table_cells(table_content, query, answer)
+ if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT:
+ self.truncate_table_rows(table_content, query, answer, max_length=max_length)
+
+ # step 3: linearize table
+ linear_table = self.table_linearize.process_table(table_content)
+ else:
+ linear_table = ""
+
+ if linear_table == "":
+ logger.warning(
+ "You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). "
+ + f"Please carefully check the corresponding table with the query : {query}."
+ )
+ if query == "":
+ logger.warning("You provide nothing to query with respect to the table.")
+ # step 4: concatenate query with linear_table
+ separator = " " if query and linear_table else ""
+ joint_input = (query + separator + linear_table) if query else linear_table
+
+ return joint_input
+
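+ # Worked sketch of the linearization above: with the default row linearizer a
+ # two-row table is flattened roughly as (exact separators depend on
+ # `self.table_linearize`):
+ #
+ #   "col : year | city row 1 : 2008 | beijing row 2 : 2012 | london"
+ #
+ # and the query, when present, is prepended with a single space.
+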
+ def truncate_table_cells(self, table_content: dict, question: str, answer: list):
+ # TODO (Qian): is it possible to revert the original cell if it is in the final answer?
+ cell_mapping = {}
+ for row in table_content["rows"]:
+ for i, cell in enumerate(row):
+ truncate_cell = self.truncate_cell(cell)
+ if truncate_cell is not None:
+ cell_mapping[cell] = truncate_cell
+ row[i] = truncate_cell
+
+ # modify the answer list
+ if answer is not None:
+ for i, case in enumerate(answer):
+ if case in cell_mapping:
+ answer[i] = cell_mapping[case]
+
+ def truncate_cell(self, cell_value):
+ # do not process on these cases
+ if isinstance(cell_value, (int, float)):
+ return cell_value
+ if cell_value.strip() != "":
+ try_tokens = self.tokenize(cell_value)
+ if len(try_tokens) >= self.max_cell_length:
+ retain_tokens = try_tokens[: self.max_cell_length]
+ retain_cell_value = self.convert_tokens_to_string(retain_tokens)
+ return retain_cell_value
+ else:
+ return None
+ else:
+ return cell_value
+
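+ # Note on `truncate_cell` above: it returns the shortened string only when the
+ # cell exceeds `self.max_cell_length` tokens, and `None` when the cell can be
+ # kept as-is; `truncate_table_cells` uses that `None` to skip the cell mapping.
+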
+ def truncate_table_rows(
+ self, table_content: dict, question: str, answer: Optional[Union[str, list[str]]] = None, max_length=None
+ ):
+ """
+ Args:
+ table_content:
+ {"header": xxx, "rows": xxx, "id" (Optionally): xxx}
+
+ question:
+ natural language sentence
+
+ answer:
+ the answer supervision during training; empty otherwise
+ """
+ delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length)
+ # randomly delete unrelated rows
+ self.delete_unrelated_rows(table_content, question, answer, delete_ratio)
+ # guarantee the result < max_length
+ maximum_keep_rows = 0
+ for ind, row_example in enumerate(table_content["rows"]):
+ value_string = self.table_linearize.process_row(row_example, ind + 1)
+ value_token_len = len(self.tokenize(value_string))
+ # over the size limit, and take action
+ if value_token_len > remain_token_len:
+ break
+ remain_token_len -= value_token_len
+ maximum_keep_rows += 1
+ del table_content["rows"][maximum_keep_rows:]
+
+ def estimate_delete_ratio(self, table_content: dict, question: str, max_length=None):
+ if "header" not in table_content or "rows" not in table_content:
+ raise ValueError("The table content should contain both 'header' and 'rows' keys.")
+ # calculate the tokens of the question; special tokens are only prepended to the question
+ question_tokens = self.tokenize(question, add_special_tokens=True)
+ # calculate the tokens of header
+ header_string = self.table_linearize.process_header(table_content["header"])
+ header_tokens = self.tokenize(header_string, add_special_tokens=False)
+ # split all cell values into tokens and see how many can be accommodated
+ used_token_len = len(question_tokens) + len(header_tokens)
+ # remaining token space for rows (fall back to the model maximum if max_length is not given)
+ if max_length is None:
+ max_length = self.model_max_length
+ remain_token_len = max_length - used_token_len
+
+ value_string = ""
+ for _, row_example in enumerate(table_content["rows"]):
+ # use a general index to roughly estimate the overall token len
+ value_string += self.table_linearize.process_row(row_example, 100) + " "
+ value_token_len = len(self.tokenize(value_string))
+
+ if value_token_len < remain_token_len:
+ # no row will be deleted
+ return 0.0, remain_token_len
+ else:
+ # calc a roughly delete rate
+ return 1.0 - remain_token_len / value_token_len, remain_token_len
+
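+ # Worked example of the estimate above (illustrative numbers): with
+ # max_length=1024 and question+header consuming 124 tokens, remain_token_len is
+ # 900; if all rows together linearize to 1500 tokens, the returned delete ratio
+ # is 1 - 900 / 1500 = 0.4, i.e. roughly 40% of the rows should be dropped.
+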
+ def delete_unrelated_rows(self, table_content: dict, question: str, answer: list, delete_ratio: float):
+ """
+ The argument answer is used only during training.
+ """
+ truncated_unrelated_indices = []
+ related_indices = []
+ if answer is None or len(answer) == 0:
+ answer_set = set()
+ else:
+ answer_set = {ans_ex.lower() for ans_ex in answer}
+ # add question key words into answer set
+ if question is not None:
+ answer_set.update(question.split())
+ question_set = set(question.strip("?!.,").split(" "))
+ row_max_len = len(table_content["rows"])
+ for _row_idx, row in enumerate(table_content["rows"]):
+ lower_row = {str(cell).lower() for cell in row}
+ if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0:
+ truncated_unrelated_indices.append(_row_idx)
+ else:
+ # add neighbours to preserve information aggressively
+ related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2])
+
+ # remove the neighbours
+ truncated_unrelated_indices = [
+ _row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices
+ ]
+ # select some cases to drop
+ drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio))
+ # sample without replacement so each dropped index is distinct
+ drop_row_indices = random.sample(truncated_unrelated_indices, k=drop_items)
+
+ for _row_idx in reversed(range(row_max_len)):
+ if _row_idx in drop_row_indices:
+ del table_content["rows"][_row_idx]
+
+ # warn only when rows were actually dropped
+ if "id" in table_content and len(drop_row_indices) > 0:
+ logger.warning(f"Deleted {len(drop_row_indices)} rows in table {table_content['id']}")
+
+
+__all__ = ["TapexTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bccdd12c9703313951e223d0ac1955f9ca1581
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_trajectory_transformer import *
+ from .modeling_trajectory_transformer import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..382081120f19031dd0493e3bc0c067c29162d427
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..875676f45ec913a07913053f3eb712ef8601b656
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/configuration_trajectory_transformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75faab8abf218275c25a6ae3c8b89c6b2d09bca3
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/__pycache__/modeling_trajectory_transformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a267cc4c4b82c6703eaa3a732d3fca1606a60d7
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
@@ -0,0 +1,155 @@
+# coding=utf-8
+# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TrajectoryTransformer model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TrajectoryTransformerConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`TrajectoryTransformerModel`]. It is used to
+ instantiate a TrajectoryTransformer model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the
+ TrajectoryTransformer
+ [CarlCochet/trajectory-transformer-halfcheetah-medium-v2](https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2)
+ architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 100):
+ Vocabulary size of the TrajectoryTransformer model. Defines the number of different tokens that can be
+ represented by the `trajectories` passed when calling [`TrajectoryTransformerModel`]
+ action_weight (`int`, *optional*, defaults to 5):
+ Weight of the action in the loss function
+ reward_weight (`int`, *optional*, defaults to 1):
+ Weight of the reward in the loss function
+ value_weight (`int`, *optional*, defaults to 1):
+ Weight of the value in the loss function
+ block_size (`int`, *optional*, defaults to 249):
+ Size of the blocks in the trajectory transformer.
+ action_dim (`int`, *optional*, defaults to 6):
+ Dimension of the action space.
+ observation_dim (`int`, *optional*, defaults to 17):
+ Dimension of the observation space.
+ transition_dim (`int`, *optional*, defaults to 25):
+ Dimension of the transition space.
+ n_layer (`int`, *optional*, defaults to 4):
+ Number of hidden layers in the Transformer encoder.
+ n_head (`int`, *optional*, defaults to 4):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ n_embd (`int`, *optional*, defaults to 128):
+ Dimensionality of the embeddings and hidden states.
+ resid_pdrop (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ embd_pdrop (`int`, *optional*, defaults to 0.1):
+ The dropout ratio for the embeddings.
+ attn_pdrop (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ kaiming_initializer_range (`float`, *optional*, defaults to 1):
+ A coefficient scaling the negative slope of the kaiming initializer rectifier for EinLinear layers.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models). Only
+ relevant if `config.is_decoder=True`.
+ Example:
+
+ ```python
+ >>> from transformers import TrajectoryTransformerConfig, TrajectoryTransformerModel
+
+ >>> # Initializing a TrajectoryTransformer CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration
+ >>> configuration = TrajectoryTransformerConfig()
+
+ >>> # Initializing a model (with random weights) from the CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration
+ >>> model = TrajectoryTransformerModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "trajectory_transformer"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {
+ "hidden_size": "n_embd",
+ "num_attention_heads": "n_head",
+ "num_hidden_layers": "n_layer",
+ }
+
+ def __init__(
+ self,
+ vocab_size=100,
+ action_weight=5,
+ reward_weight=1,
+ value_weight=1,
+ block_size=249,
+ action_dim=6,
+ observation_dim=17,
+ transition_dim=25,
+ n_layer=4,
+ n_head=4,
+ n_embd=128,
+ embd_pdrop=0.1,
+ attn_pdrop=0.1,
+ resid_pdrop=0.1,
+ learning_rate=0.0006,
+ max_position_embeddings=512,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ kaiming_initializer_range=1,
+ use_cache=True,
+ pad_token_id=1,
+ bos_token_id=50256,
+ eos_token_id=50256,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.action_weight = action_weight
+ self.reward_weight = reward_weight
+ self.value_weight = value_weight
+ self.max_position_embeddings = max_position_embeddings
+ self.block_size = block_size
+ self.action_dim = action_dim
+ self.observation_dim = observation_dim
+ self.transition_dim = transition_dim
+ self.learning_rate = learning_rate
+ self.n_layer = n_layer
+ self.n_head = n_head
+ self.n_embd = n_embd
+ self.embd_pdrop = embd_pdrop
+ self.attn_pdrop = attn_pdrop
+ self.resid_pdrop = resid_pdrop
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.kaiming_initializer_range = kaiming_initializer_range
+ self.use_cache = use_cache
+ super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+
+__all__ = ["TrajectoryTransformerConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6ae410c1474140c018c88bb5cbdaf8ebf076226
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
@@ -0,0 +1,602 @@
+# coding=utf-8
+# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch TrajectoryTransformer model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_trajectory_transformer import TrajectoryTransformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
+_CONFIG_FOR_DOC = "TrajectoryTransformerConfig"
+
+
+def load_tf_weights_in_trajectory_transformer(model, config, tf_checkpoint_path):
+ """Load tf checkpoints in a pytorch model."""
+ try:
+ import re
+
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ tf_path = os.path.abspath(tf_checkpoint_path)
+ logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ names = []
+ arrays = []
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ names.append(name)
+ arrays.append(array)
+
+ for name, array in zip(names, arrays):
+ name = name.split("/")
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v,
+ # which are not required for using the pretrained model
+ if any(
+ n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+ for n in name
+ ):
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ pointer = model
+ for m_name in name:
+ if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+ scope_names = re.split(r"_(\d+)", m_name)
+ else:
+ scope_names = [m_name]
+ if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+ pointer = getattr(pointer, "bias")
+ elif scope_names[0] == "output_weights":
+ pointer = getattr(pointer, "weight")
+ elif scope_names[0] == "squad":
+ pointer = getattr(pointer, "classifier")
+ else:
+ try:
+ pointer = getattr(pointer, scope_names[0])
+ except AttributeError:
+ logger.info(f"Skipping {'/'.join(name)}")
+ continue
+ if len(scope_names) >= 2:
+ num = int(scope_names[1])
+ pointer = pointer[num]
+ if m_name[-11:] == "_embeddings":
+ pointer = getattr(pointer, "weight")
+ elif m_name == "kernel":
+ array = np.transpose(array)
+ try:
+ if pointer.shape != array.shape:
+ raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+ except ValueError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ return model
+
+
+@dataclass
+class TrajectoryTransformerOutput(ModelOutput):
+ """
+ Base class for model's outputs that also contains a pooling of the last hidden states.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+ past_key_values (`tuple[tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+ Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads,
+ sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the
+ attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. GPT2Attentions weights after the attention softmax, used to compute the weighted average
+ in the self-attention heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[Cache] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class TrajectoryTransformerPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: TrajectoryTransformerConfig
+ load_tf_weights = load_tf_weights_in_trajectory_transformer
+ base_model_prefix = "trajectory_transformer"
+ main_input_name = "trajectories"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ if isinstance(module, (nn.Linear, nn.Embedding)):
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, EinLinear):
+ for i in range(module.n_models):
+ nn.init.kaiming_uniform_(module.weight[i], a=math.sqrt(5) / self.config.kaiming_initializer_range)
+ if module.bias is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight[i])
+ bound = (1 / math.sqrt(fan_in)) * self.config.initializer_range
+ nn.init.uniform_(module.bias[i], -bound, bound)
+
+
+TRAJECTORY_TRANSFORMER_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`TrajectoryTransformerConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING = r"""
+ Args:
+ trajectories (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Batch of trajectories, where a trajectory is a sequence of states, actions and rewards.
+ past_key_values (`tuple[tuple[torch.Tensor]]` of length `config.n_layers`, *optional*):
+ Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
+ `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
+ their past given to this model should not be passed as `input_ids` as they have already been computed.
+ targets (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Desired targets used to compute the loss.
+ attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class EinLinear(nn.Module):
+ def __init__(self, n_models, in_features, out_features, bias):
+ super().__init__()
+ self.n_models = n_models
+ self.out_features = out_features
+ self.in_features = in_features
+ self.weight = nn.Parameter(torch.Tensor(n_models, out_features, in_features))
+ if bias:
+ self.bias = nn.Parameter(torch.Tensor(n_models, out_features))
+ else:
+ self.register_parameter("bias", None)
+
+ def reset_parameters(self):
+ for i in range(self.n_models):
+ nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5))
+ if self.bias is not None:
+ fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[i])
+ bound = 1 / math.sqrt(fan_in)
+ nn.init.uniform_(self.bias[i], -bound, bound)
+
+ def forward(self, input):
+ """
+ Args:
+ input (`torch.FloatTensor` of shape `(B, n_models, input_dim)`):
+ The input to the layer.
+ """
+ # [ batch_size x n_models x output_dim ]
+ output = torch.einsum("eoi,bei->beo", self.weight, input)
+ if self.bias is not None:
+ raise RuntimeError("EinLinear does not support a bias term in the forward pass")
+ return output
+
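+ # Shape sketch for the einsum above: `weight` is (n_models, out, in) and
+ # `input` is (batch, n_models, in), so "eoi,bei->beo" applies model e's matrix
+ # to slot e of every batch element, yielding (batch, n_models, out): one
+ # independent linear head per transition slot.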
+
+class CausalSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+
+ if config.n_embd % config.n_head != 0:
+ raise ValueError(f"n_head ({config.n_head}) should be a divisor of n_embd ({config.n_embd})")
+
+ # key, query, value projections for all heads
+ self.key = nn.Linear(config.n_embd, config.n_embd)
+ self.query = nn.Linear(config.n_embd, config.n_embd)
+ self.value = nn.Linear(config.n_embd, config.n_embd)
+
+ # regularization
+ self.attn_drop = nn.Dropout(config.attn_pdrop)
+ self.resid_drop = nn.Dropout(config.resid_pdrop)
+
+ # output projection
+ self.proj = nn.Linear(config.n_embd, config.n_embd)
+
+ # causal mask to ensure that attention is only applied to the left in the input sequence
+ self.register_buffer(
+ "mask",
+ torch.tril(torch.ones(config.block_size, config.block_size)).view(
+ 1, 1, config.block_size, config.block_size
+ ),
+ persistent=False,
+ )
+
+ # mask previous value estimates
+ joined_dim = config.observation_dim + config.action_dim + 2
+ self.mask.squeeze()[:, joined_dim - 1 :: joined_dim] = 0
+
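+ # Worked example of the masking above (assuming the config defaults): with
+ # observation_dim=17 and action_dim=6, joined_dim is 25, so mask columns
+ # 24, 49, 74, ... are zeroed and no position can attend to earlier value
+ # estimates within a transition.
+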
+ self.n_head = config.n_head
+
+ def forward(
+ self,
+ hidden_states: Optional[tuple[torch.FloatTensor]],
+ layer_past: Optional[tuple[torch.Tensor]] = None,
+ use_cache: Optional[bool] = False,
+ output_attentions: Optional[bool] = False,
+ ):
+ batch_size, sequence_length, embedding_dim = hidden_states.size()
+
+ # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+ # [ batch_size x n_heads x sequence_length x head_dim ]
+ key = (
+ self.key(hidden_states)
+ .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
+ .transpose(1, 2)
+ )
+ query = (
+ self.query(hidden_states)
+ .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
+ .transpose(1, 2)
+ )
+ value = (
+ self.value(hidden_states)
+ .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
+ .transpose(1, 2)
+ )
+
+ if layer_past is not None:
+ past_key, past_value = layer_past
+ key = torch.cat((past_key, key), dim=-2)
+ value = torch.cat((past_value, value), dim=-2)
+
+ if use_cache is True:
+ present = (key, value)
+ else:
+ present = None
+
+ # causal self-attention
+ # [ batch_size x n_heads x sequence_length x sequence_length ]
+ attn_weights = (torch.matmul(query, key.transpose(-2, -1))) * (1.0 / math.sqrt(key.size(-1)))
+ attn_weights = attn_weights.masked_fill(
+ self.mask[:, :, :sequence_length, :sequence_length] == 0, torch.finfo(attn_weights.dtype).min
+ )
+ attn_weights = F.softmax(attn_weights, dim=-1)
+ self._attn_map = attn_weights.clone()
+ attn_weights = self.attn_drop(attn_weights)
+
+ output = torch.matmul(attn_weights, value)
+ # [ batch_size x sequence_length x embedding_dim ]
+ # re-assemble all head outputs side by side
+ output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, embedding_dim)
+
+ # output projection
+ output = self.resid_drop(self.proj(output))
+
+ outputs = (output, present)
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
+
+
+class Block(GradientCheckpointingLayer):
+ def __init__(self, config):
+ super().__init__()
+ self.ln1 = nn.LayerNorm(config.n_embd)
+ self.ln2 = nn.LayerNorm(config.n_embd)
+ self.attn = CausalSelfAttention(config)
+
+ # MLP
+ self.l1 = nn.Linear(config.n_embd, 4 * config.n_embd)
+ self.act = nn.GELU()
+ self.l2 = nn.Linear(4 * config.n_embd, config.n_embd)
+ self.drop = nn.Dropout(config.resid_pdrop)
+
+ def forward(
+ self,
+ hidden_states: Optional[tuple[torch.FloatTensor]],
+ layer_past: Optional[tuple[torch.Tensor]] = None,
+ use_cache: Optional[bool] = False,
+ output_attentions: Optional[bool] = False,
+ ):
+ residual = hidden_states
+ hidden_states = self.ln1(hidden_states)
+
+ attn_outputs = self.attn(
+ hidden_states, layer_past=layer_past, use_cache=use_cache, output_attentions=output_attentions
+ )
+ attn_output = attn_outputs[0]
+ outputs = attn_outputs[1:]
+ hidden_states = attn_output + residual
+
+ residual = hidden_states
+ hidden_states = self.ln2(hidden_states)
+ hidden_states = self.l1(hidden_states)
+ hidden_states = self.act(hidden_states)
+ hidden_states = self.l2(hidden_states)
+ hidden_states = residual + self.drop(hidden_states)
+
+ if use_cache:
+ outputs = (hidden_states,) + outputs
+ else:
+ outputs = (hidden_states,) + outputs[1:]
+
+ return outputs
+
+
+@add_start_docstrings(
+ "The bare TrajectoryTransformer Model transformer outputting raw hidden-states without any specific head on top.",
+ TRAJECTORY_TRANSFORMER_START_DOCSTRING,
+)
+class TrajectoryTransformerModel(TrajectoryTransformerPreTrainedModel):
+ """the full GPT language model, with a context size of block_size"""
+
+ def __init__(self, config):
+ super().__init__(config)
+
+ # input embedding stem (+1 for stop token)
+ self.tok_emb = nn.Embedding(config.vocab_size * config.transition_dim + 1, config.n_embd)
+
+ self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
+ self.drop = nn.Dropout(config.embd_pdrop)
+ # transformer
+ self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
+ # decoder head
+ self.ln_f = nn.LayerNorm(config.n_embd)
+ self.head = EinLinear(config.transition_dim, config.n_embd, config.vocab_size + 1, bias=False)
+
+ self.vocab_size = config.vocab_size
+ self.stop_token = config.vocab_size * config.transition_dim
+ self.block_size = config.block_size
+
+ self.observation_dim = config.observation_dim
+ self.action_dim = config.action_dim
+ self.transition_dim = config.transition_dim
+ self.embedding_dim = config.n_embd
+
+ self.action_weight = config.action_weight
+ self.reward_weight = config.reward_weight
+ self.value_weight = config.value_weight
+
+ self.gradient_checkpointing = False
+
+ self.post_init()
+
+ def get_block_size(self):
+ return self.block_size
+
+ def offset_tokens(self, trajectories):
+ _, sequence_length = trajectories.shape
+
+ n_states = int(np.ceil(sequence_length / self.transition_dim))
+
+ offsets = torch.arange(self.transition_dim) * self.vocab_size
+ offsets = offsets.repeat(n_states).to(trajectories.device)
+
+ offset_trajectories = trajectories + offsets[:sequence_length]
+ offset_trajectories[trajectories == self.vocab_size] = self.stop_token
+ return offset_trajectories
+
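+ # Worked example of the offsetting above: with vocab_size=100 and
+ # transition_dim=25, the token at slot j of each transition is shifted by
+ # j * 100, giving every slot its own disjoint id range inside the shared
+ # embedding table; raw ids equal to vocab_size map to the stop token.
+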
+ def pad_to_full_observation(self, hidden_states):
+ batch_size, sequence_length, _ = hidden_states.shape
+
+ n_pad = (self.transition_dim - sequence_length % self.transition_dim) % self.transition_dim
+ padding = torch.zeros(batch_size, n_pad, self.embedding_dim, device=hidden_states.device)
+
+ # [ batch_size x padded_sequence_length' x embedding_dim ]
+ hidden_states_pad = torch.cat([hidden_states, padding], dim=1)
+ hidden_states_pad = hidden_states_pad.view(-1, self.transition_dim, self.embedding_dim)
+
+ return hidden_states_pad, n_pad
+
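+ # Quick arithmetic for the padding above: with transition_dim=25 and
+ # sequence_length=24 (one transition minus its final token), n_pad is 1, so
+ # the padded sequence reshapes cleanly into (-1, 25, embedding_dim) blocks for
+ # the per-slot EinLinear head.
+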
+ @add_start_docstrings_to_model_forward(
+ TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+ )
+ @replace_return_docstrings(output_type=TrajectoryTransformerOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ trajectories: Optional[torch.LongTensor] = None,
+ past_key_values: Optional[Cache] = None,
+ targets: Optional[torch.FloatTensor] = None,
+ attention_mask: Optional[torch.FloatTensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.Tensor], TrajectoryTransformerOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> import numpy as np
+ >>> import torch
+ >>> from transformers import TrajectoryTransformerModel

+ >>> model = TrajectoryTransformerModel.from_pretrained(
+ ... "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
+ ... )
+ >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+ >>> model.to(device)
+ >>> model.eval()

+ >>> observations_dim, action_dim, batch_size = 17, 6, 256
+ >>> seq_length = observations_dim + action_dim + 1

+ >>> trajectories = torch.LongTensor([np.random.permutation(seq_length) for _ in range(batch_size)]).to(device)
+ >>> targets = torch.LongTensor([np.random.permutation(seq_length) for _ in range(batch_size)]).to(device)
+
+ >>> outputs = model(
+ ... trajectories,
+ ... targets=targets,
+ ... use_cache=True,
+ ... output_attentions=True,
+ ... output_hidden_states=True,
+ ... return_dict=True,
+ ... )
+ ```
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+
+ if past_key_values is None:
+ past_key_values = tuple([None] * len(self.blocks))
+
+ batch_size, sequence_length = trajectories.size()
+
+ if sequence_length > self.block_size:
+ raise ValueError("Cannot forward, model block size is exhausted.")
+
+ offset_trajectories = self.offset_tokens(trajectories)
+ # [ batch_size x sequence_length x embedding_dim ]
+ # forward the GPT model
+ token_embeddings = self.tok_emb(offset_trajectories) # each index maps to a (learnable) vector
+ position_embeddings = self.pos_emb[:, :sequence_length, :] # each position maps to a (learnable) vector
+
+ hidden_states = self.drop(token_embeddings + position_embeddings)
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ presents = () if use_cache else None
+ all_self_attentions = () if output_attentions else None
+ all_hidden_states = () if output_hidden_states else None
+
+ for i, (block, layer_past) in enumerate(zip(self.blocks, past_key_values)):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ outputs = block(hidden_states, layer_past, use_cache, output_attentions)
+
+ hidden_states = outputs[0]
+ if use_cache is True:
+ presents = presents + (outputs[1],)
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
+
+ # [ batch_size x sequence_length x embedding_dim ]
+ hidden_state = self.ln_f(hidden_states)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ hidden_states_pad, n_pad = self.pad_to_full_observation(hidden_state)
+
+ logits = self.head(hidden_states_pad)
+ logits = logits.reshape(batch_size, sequence_length + n_pad, self.vocab_size + 1)
+ logits = logits[:, :sequence_length]
+
+ # if we are given some desired targets also calculate the loss
+ if targets is not None:
+ loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.view(-1), reduction="none")
+ if self.action_weight != 1 or self.reward_weight != 1 or self.value_weight != 1:
+ # make weights
+ n_states = int(np.ceil(sequence_length / self.transition_dim))
+ weights = torch.cat(
+ [
+ torch.ones(self.observation_dim, device=trajectories.device),
+ torch.ones(self.action_dim, device=trajectories.device) * self.action_weight,
+ torch.ones(1, device=trajectories.device) * self.reward_weight,
+ torch.ones(1, device=trajectories.device) * self.value_weight,
+ ]
+ )
+ weights = weights.repeat(n_states)
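+                # drop the first weight: with next-token targets the first input
+                # position is never predicted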
+ weights = weights[1:].repeat(batch_size, 1)
+ loss = loss * weights.view(-1)
+            if attention_mask is not None:
+                loss = (loss * attention_mask.view(-1)).mean()
+            else:
+                loss = loss.mean()
+ else:
+ loss = None
+
+ if not return_dict:
+ return tuple(v for v in [loss, logits, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+ return TrajectoryTransformerOutput(
+ loss=loss,
+ logits=logits,
+ past_key_values=presents,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+__all__ = [
+ "TrajectoryTransformerModel",
+ "TrajectoryTransformerPreTrainedModel",
+ "load_tf_weights_in_trajectory_transformer",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ac9a2cbf4766bd187d90fdaf46505db3e92b68f
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_transfo_xl import *
+ from .modeling_tf_transfo_xl import *
+ from .modeling_transfo_xl import *
+ from .tokenization_transfo_xl import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aadb5d2529c43397da1205189cab08034470fe5b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b8dc8853600c48bf7c4dba9b9d9d3bdfdf900af
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/configuration_transfo_xl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..925ed3b8dbf7a027d63a6496a56b9d99a373aae4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e000e3198b21181260b3c4e40f44a6386f326e85
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_tf_transfo_xl_utilities.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a55a83152ccb865141494f775430a7f4df3de80b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..808913a7366280c1e452c8397785045d9e73fe45
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/modeling_transfo_xl_utilities.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f007520875f8088e41a330ec71fb1362c8ef91ee
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/__pycache__/tokenization_transfo_xl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7926246c4cbb37904fb3044507f4831bb7d9bb9
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
@@ -0,0 +1,189 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer XL configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TransfoXLConfig(PretrainedConfig):
+ """
+ This is the configuration class to store the configuration of a [`TransfoXLModel`] or a [`TFTransfoXLModel`]. It is
+ used to instantiate a Transformer-XL model according to the specified arguments, defining the model architecture.
+ Instantiating a configuration with the defaults will yield a similar configuration to that of the TransfoXL
+ [transfo-xl/transfo-xl-wt103](https://huggingface.co/transfo-xl/transfo-xl-wt103) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ vocab_size (`int`, *optional*, defaults to 267735):
+            Vocabulary size of the Transformer-XL model. Defines the number of different tokens that can be
+            represented by the `input_ids` passed when calling [`TransfoXLModel`] or [`TFTransfoXLModel`].
+ cutoffs (`list[int]`, *optional*, defaults to `[20000, 40000, 200000]`):
+ Cutoffs for the adaptive softmax.
+ d_model (`int`, *optional*, defaults to 1024):
+ Dimensionality of the model's hidden states.
+ d_embed (`int`, *optional*, defaults to 1024):
+            Dimensionality of the embeddings.
+ n_head (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ d_head (`int`, *optional*, defaults to 64):
+ Dimensionality of the model's heads.
+ d_inner (`int`, *optional*, defaults to 4096):
+            Inner dimension of the feed-forward (FF) layers.
+        div_val (`int`, *optional*, defaults to 4):
+            Divisor value for the adaptive input and softmax.
+ pre_lnorm (`boolean`, *optional*, defaults to `False`):
+ Whether or not to apply LayerNorm to the input instead of the output in the blocks.
+ n_layer (`int`, *optional*, defaults to 18):
+ Number of hidden layers in the Transformer encoder.
+ mem_len (`int`, *optional*, defaults to 1600):
+            Length of the retained previous hidden states (the memory).
+        clamp_len (`int`, *optional*, defaults to 1000):
+            Use the same positional embeddings after `clamp_len`.
+        same_length (`boolean`, *optional*, defaults to `True`):
+            Whether or not to use the same attention length for all tokens.
+        proj_share_all_but_first (`boolean`, *optional*, defaults to `True`):
+            Whether or not to share all but the first projection layers.
+ attn_type (`int`, *optional*, defaults to 0):
+            Attention type. 0 for Transformer-XL, 1 for Shaw et al., 2 for Vaswani et al., 3 for Al-Rfou et al.
+ sample_softmax (`int`, *optional*, defaults to -1):
+ Number of samples in the sampled softmax.
+ adaptive (`boolean`, *optional*, defaults to `True`):
+ Whether or not to use adaptive softmax.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ dropatt (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ untie_r (`boolean`, *optional*, defaults to `True`):
+            Whether or not to untie relative position biases.
+ init (`str`, *optional*, defaults to `"normal"`):
+ Parameter initializer to use.
+ init_range (`float`, *optional*, defaults to 0.01):
+ Parameters initialized by U(-init_range, init_range).
+ proj_init_std (`float`, *optional*, defaults to 0.01):
+            Projection parameters initialized by N(0, proj_init_std).
+        init_std (`float`, *optional*, defaults to 0.02):
+            Parameters initialized by N(0, init_std).
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon to use in the layer normalization layers.
+ eos_token_id (`int`, *optional*, defaults to 0):
+ End of stream token id.
+
+ Examples:
+
+ ```python
+ >>> from transformers import TransfoXLConfig, TransfoXLModel
+
+ >>> # Initializing a Transformer XL configuration
+ >>> configuration = TransfoXLConfig()
+
+ >>> # Initializing a model (with random weights) from the configuration
+ >>> model = TransfoXLModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "transfo-xl"
+ keys_to_ignore_at_inference = ["mems"]
+ attribute_map = {
+ "n_token": "vocab_size",
+ "hidden_size": "d_model",
+ "num_attention_heads": "n_head",
+ "num_hidden_layers": "n_layer",
+ }
+
+ def __init__(
+ self,
+ vocab_size=267735,
+ cutoffs=[20000, 40000, 200000],
+ d_model=1024,
+ d_embed=1024,
+ n_head=16,
+ d_head=64,
+ d_inner=4096,
+ div_val=4,
+ pre_lnorm=False,
+ n_layer=18,
+ mem_len=1600,
+ clamp_len=1000,
+ same_length=True,
+ proj_share_all_but_first=True,
+ attn_type=0,
+ sample_softmax=-1,
+ adaptive=True,
+ dropout=0.1,
+ dropatt=0.0,
+ untie_r=True,
+ init="normal",
+ init_range=0.01,
+ proj_init_std=0.01,
+ init_std=0.02,
+ layer_norm_epsilon=1e-5,
+ eos_token_id=0,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+        self.cutoffs = list(cutoffs)
+        # tie all adaptive-softmax projections to the input ones except the first
+        if proj_share_all_but_first:
+            self.tie_projs = [False] + [True] * len(self.cutoffs)
+        else:
+            self.tie_projs = [False] * (len(self.cutoffs) + 1)
+ self.d_model = d_model
+ self.d_embed = d_embed
+ self.d_head = d_head
+ self.d_inner = d_inner
+ self.div_val = div_val
+ self.pre_lnorm = pre_lnorm
+ self.n_layer = n_layer
+ self.n_head = n_head
+ self.mem_len = mem_len
+ self.same_length = same_length
+ self.attn_type = attn_type
+ self.clamp_len = clamp_len
+ self.sample_softmax = sample_softmax
+ self.adaptive = adaptive
+ self.dropout = dropout
+ self.dropatt = dropatt
+ self.untie_r = untie_r
+ self.init = init
+ self.init_range = init_range
+ self.proj_init_std = proj_init_std
+ self.init_std = init_std
+ self.layer_norm_epsilon = layer_norm_epsilon
+ super().__init__(eos_token_id=eos_token_id, **kwargs)
+
+ @property
+ def max_position_embeddings(self):
+ # Message copied from Transformer-XL documentation
+ logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
+ return -1
+
+ @max_position_embeddings.setter
+ def max_position_embeddings(self, value):
+ # Message copied from Transformer-XL documentation
+ raise NotImplementedError(
+ f"The model {self.model_type} is one of the few models that has no sequence length limit."
+ )
+
+
+__all__ = ["TransfoXLConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c7830d633448bde3b8a420f0d84cdd070498882
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
@@ -0,0 +1,1128 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+TF 2.0 Transformer XL model.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ....modeling_tf_utils import (
+ TFModelInputType,
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ keras,
+ keras_serializable,
+ unpack_inputs,
+)
+from ....tf_utils import shape_list, stable_softmax
+from ....utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+)
+from .configuration_transfo_xl import TransfoXLConfig
+from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
+_CONFIG_FOR_DOC = "TransfoXLConfig"
+
+
+class TFPositionalEmbedding(keras.layers.Layer):
+ def __init__(self, demb, **kwargs):
+ super().__init__(**kwargs)
+
+ self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
+
+ def call(self, pos_seq, bsz=None):
+ self.inv_freq = tf.cast(self.inv_freq, dtype=pos_seq.dtype)
+ sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq)
+ pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
+
+ if bsz is not None:
+ return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
+ else:
+ return pos_emb[:, None, :]
+
+
+class TFPositionwiseFF(keras.layers.Layer):
+ def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
+ super().__init__(**kwargs)
+
+ self.d_model = d_model
+ self.d_inner = d_inner
+ self.dropout = dropout
+
+ self.layer_1 = keras.layers.Dense(
+ d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
+ )
+ self.drop_1 = keras.layers.Dropout(dropout)
+ self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
+ self.drop_2 = keras.layers.Dropout(dropout)
+
+ self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
+
+ self.pre_lnorm = pre_lnorm
+
+ def call(self, inp, training=False):
+ if self.pre_lnorm:
+ # layer normalization + positionwise feed-forward
+ core_out = self.layer_norm(inp)
+ core_out = self.layer_1(core_out)
+ core_out = self.drop_1(core_out, training=training)
+ core_out = self.layer_2(core_out)
+ core_out = self.drop_2(core_out, training=training)
+
+ # residual connection
+ output = core_out + inp
+ else:
+ # positionwise feed-forward
+ core_out = self.layer_1(inp)
+ core_out = self.drop_1(core_out, training=training)
+ core_out = self.layer_2(core_out)
+ core_out = self.drop_2(core_out, training=training)
+
+ # residual connection + layer normalization
+ output = self.layer_norm(inp + core_out)
+
+ return output
+
+
+class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer):
+ def __init__(
+ self,
+ n_head,
+ d_model,
+ d_head,
+ dropout,
+ dropatt=0.0,
+ pre_lnorm=False,
+ r_r_bias=None,
+ r_w_bias=None,
+ layer_norm_epsilon=1e-5,
+ init_std=0.02,
+ output_attentions=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.n_head = n_head
+ self.d_model = d_model
+ self.d_head = d_head
+ self.dropout = dropout
+ self.output_attentions = output_attentions
+
+ self.qkv_net = keras.layers.Dense(
+ 3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net"
+ )
+
+ self.drop = keras.layers.Dropout(dropout)
+ self.dropatt = keras.layers.Dropout(dropatt)
+ self.o_net = keras.layers.Dense(
+ d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net"
+ )
+
+ self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
+
+ self.scale = 1 / (d_head**0.5)
+
+ self.pre_lnorm = pre_lnorm
+
+ if r_r_bias is not None and r_w_bias is not None: # Biases are shared
+ self.r_r_bias = r_r_bias
+ self.r_w_bias = r_w_bias
+ else:
+ self.r_r_bias = None
+ self.r_w_bias = None
+
+ self.r_net = keras.layers.Dense(
+ self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net"
+ )
+
+ def build(self, input_shape):
+ if self.r_r_bias is None or self.r_w_bias is None: # Biases are not shared
+ self.r_r_bias = self.add_weight(
+ shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
+ )
+ self.r_w_bias = self.add_weight(
+ shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
+ )
+ super().build(input_shape)
+
+ def _rel_shift(self, x):
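+        # Transformer-XL relative shift: pad one zero column along the key axis,
+        # reshape so the padding rotates the rows into place, then slice it off.
+        # This aligns each query with its relative-position scores without a gather.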
+ x_size = shape_list(x)
+
+ x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
+ x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
+ x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
+ x = tf.reshape(x, x_size)
+
+ return x
+
+ def call(self, w, r, attn_mask, mems, head_mask, output_attentions, training=False):
+ qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1]
+
+ if mems is not None:
+ mems = tf.cast(mems, dtype=w.dtype)
+ cat = tf.concat([mems, w], 0)
+ if self.pre_lnorm:
+ w_heads = self.qkv_net(self.layer_norm(cat))
+ else:
+ w_heads = self.qkv_net(cat)
+ r_head_k = self.r_net(r)
+
+ w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
+ w_head_q = w_head_q[-qlen:]
+ else:
+ if self.pre_lnorm:
+ w_heads = self.qkv_net(self.layer_norm(w))
+ else:
+ w_heads = self.qkv_net(w)
+ r_head_k = self.r_net(r)
+
+ w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
+
+ klen = shape_list(w_head_k)[0]
+
+ w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head
+ w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head
+ w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head)) # qlen x bsz x n_head x d_head
+
+ r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head)) # qlen x n_head x d_head
+
+ # compute attention score
+ rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head
+ AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k) # qlen x klen x bsz x n_head
+
+ rr_head_q = w_head_q + self.r_r_bias
+ BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k) # qlen x klen x bsz x n_head
+ BD = self._rel_shift(BD)
+
+ # [qlen x klen x bsz x n_head]
+ attn_score = AC + BD
+ attn_score = attn_score * self.scale
+
+ # compute attention probability
+ if attn_mask is not None:
+ attn_mask_t = attn_mask[:, :, None, None]
+ attn_mask_t = tf.cast(attn_mask_t, dtype=attn_score.dtype)
+ attn_score = attn_score * (1.0 - attn_mask_t) - 1e30 * attn_mask_t
+
+ # [qlen x klen x bsz x n_head]
+ attn_prob = stable_softmax(attn_score, axis=1)
+ attn_prob = self.dropatt(attn_prob, training=training)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attn_prob = attn_prob * head_mask
+
+ # compute attention vector
+ attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v)
+
+ # [qlen x bsz x n_head x d_head]
+ attn_vec_sizes = shape_list(attn_vec)
+ attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head))
+
+ # linear projection
+ attn_out = self.o_net(attn_vec)
+ attn_out = self.drop(attn_out, training=training)
+
+ if self.pre_lnorm:
+ # residual connection
+ outputs = [w + attn_out]
+ else:
+ # residual connection + layer normalization
+ outputs = [self.layer_norm(w + attn_out)]
+
+ if output_attentions:
+ outputs.append(attn_prob)
+
+ return outputs
+
+
+class TFRelPartialLearnableDecoderLayer(keras.layers.Layer):
+ def __init__(
+ self,
+ n_head,
+ d_model,
+ d_head,
+ d_inner,
+ dropout,
+ dropatt=0.0,
+ pre_lnorm=False,
+ r_w_bias=None,
+ r_r_bias=None,
+ layer_norm_epsilon=1e-5,
+ init_std=0.02,
+ output_attentions=False,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
+ n_head,
+ d_model,
+ d_head,
+ dropout,
+ dropatt=dropatt,
+ pre_lnorm=pre_lnorm,
+ r_w_bias=r_w_bias,
+ r_r_bias=r_r_bias,
+ init_std=init_std,
+ layer_norm_epsilon=layer_norm_epsilon,
+ output_attentions=output_attentions,
+ name="dec_attn",
+ )
+ self.pos_ff = TFPositionwiseFF(
+ d_model,
+ d_inner,
+ dropout,
+ pre_lnorm=pre_lnorm,
+ init_std=init_std,
+ layer_norm_epsilon=layer_norm_epsilon,
+ name="pos_ff",
+ )
+
+ def call(self, dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=False):
+ attn_outputs = self.dec_attn(dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=training)
+ ff_output = self.pos_ff(attn_outputs[0], training=training)
+
+ outputs = [ff_output] + attn_outputs[1:]
+
+ return outputs
+
+
+class TFTransfoEmbeddings(keras.layers.Layer):
+ def __init__(self, vocab_size, emb_size, init_std, **kwargs):
+ super().__init__(**kwargs)
+
+ self.vocab_size = vocab_size
+ self.emb_size = emb_size
+ self.init_std = init_std
+
+ def build(self, input_shape):
+ self.weight = self.add_weight(
+ shape=(self.vocab_size, self.emb_size),
+ initializer=get_initializer(self.init_std),
+ name="embeddings",
+ )
+
+ super().build(input_shape)
+
+ def call(self, inputs):
+ return tf.gather(self.weight, inputs)
+
+
+class TFAdaptiveEmbedding(keras.layers.Layer):
+ def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
+ super().__init__(**kwargs)
+
+ self.n_token = n_token
+ self.d_embed = d_embed
+ self.init_std = init_std
+
+ self.cutoffs = cutoffs + [n_token]
+ self.div_val = div_val
+ self.d_proj = d_proj
+
+ self.emb_scale = d_proj**0.5
+
+ self.cutoff_ends = [0] + self.cutoffs
+
+ self.emb_layers = []
+ self.emb_projs = []
+
+ if div_val == 1:
+ raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+ else:
+ for i in range(len(self.cutoffs)):
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+ d_emb_i = d_embed // (div_val**i)
+ self.emb_layers.append(
+ TFTransfoEmbeddings(
+ r_idx - l_idx,
+ d_emb_i,
+ init_std,
+ name=f"emb_layers_._{i}",
+ )
+ )
+
+ def build(self, input_shape):
+ for i in range(len(self.cutoffs)):
+ d_emb_i = self.d_embed // (self.div_val**i)
+ self.emb_projs.append(
+ self.add_weight(
+ shape=(d_emb_i, self.d_proj),
+ initializer=get_initializer(self.init_std),
+ trainable=True,
+ name=f"emb_projs_._{i}",
+ )
+ )
+
+ super().build(input_shape)
+
+ def call(self, inp):
+ if self.div_val == 1:
+ raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+ else:
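+            # Adaptive input: token ids are bucketed by the frequency cutoffs. Bucket i
+            # uses an embedding of size d_embed // div_val**i that is projected back to
+            # d_proj, and per-bucket results are scattered into one flat output.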
+ inp_flat = tf.reshape(inp, (-1,))
+ emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])
+ for i in range(len(self.cutoffs)):
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+
+ mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
+
+ inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
+ emb_i = self.emb_layers[i](inp_i)
+ emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i])
+
+ mask_idx = tf.where(mask_i)
+ scatter = tf.scatter_nd(mask_idx, emb_i, shape_list(emb_flat))
+ emb_flat = tf.cast(emb_flat, dtype=scatter.dtype)
+ emb_flat += scatter
+
+ embed_shape = shape_list(inp) + [self.d_proj]
+ embed = tf.reshape(emb_flat, embed_shape)
+
+ embed *= self.emb_scale
+
+ return embed
+
+
+@keras_serializable
+class TFTransfoXLMainLayer(keras.layers.Layer):
+ config_class = TransfoXLConfig
+
+ def __init__(self, config, **kwargs):
+ super().__init__(**kwargs)
+
+ self.config = config
+ self.output_hidden_states = config.output_hidden_states
+ self.output_attentions = config.output_attentions
+ self.return_dict = config.use_return_dict
+
+ self.n_token = config.vocab_size
+
+ self.d_embed = config.d_embed
+ self.d_model = config.d_model
+ self.n_head = config.n_head
+ self.d_head = config.d_head
+ self.untie_r = config.untie_r
+
+ self.word_emb = TFAdaptiveEmbedding(
+ config.vocab_size,
+ config.d_embed,
+ config.d_model,
+ config.cutoffs,
+ div_val=config.div_val,
+ init_std=config.init_std,
+ name="word_emb",
+ )
+
+ self.drop = keras.layers.Dropout(config.dropout)
+
+ self.n_layer = config.n_layer
+ self.mem_len = config.mem_len
+ self.attn_type = config.attn_type
+
+ self.layers = []
+ if config.attn_type == 0: # the default attention
+ for i in range(config.n_layer):
+ self.layers.append(
+ TFRelPartialLearnableDecoderLayer(
+ config.n_head,
+ config.d_model,
+ config.d_head,
+ config.d_inner,
+ config.dropout,
+ dropatt=config.dropatt,
+ pre_lnorm=config.pre_lnorm,
+ r_w_bias=None if self.untie_r else self.r_w_bias,
+ r_r_bias=None if self.untie_r else self.r_r_bias,
+ layer_norm_epsilon=config.layer_norm_epsilon,
+ init_std=config.init_std,
+ output_attentions=self.output_attentions,
+ name=f"layers_._{i}",
+ )
+ )
+ else: # learnable embeddings and absolute embeddings
+ raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+ self.same_length = config.same_length
+ self.clamp_len = config.clamp_len
+
+ if self.attn_type == 0: # default attention
+ self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb")
+ else: # learnable embeddings and absolute embeddings
+ raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+ def build(self, input_shape):
+ if not self.untie_r:
+ self.r_w_bias = self.add_weight(
+ shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
+ )
+ self.r_r_bias = self.add_weight(
+ shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
+ )
+ super().build(input_shape)
+
+ def get_input_embeddings(self):
+ return self.word_emb
+
+ def set_input_embeddings(self, value):
+ raise NotImplementedError
+
+ def backward_compatible(self):
+ self.sample_softmax = -1
+
+ def reset_memory_length(self, mem_len):
+ self.mem_len = mem_len
+
+ def _prune_heads(self, heads):
+ raise NotImplementedError
+
+ def init_mems(self, bsz):
+ if self.mem_len > 0:
+ mems = []
+ for i in range(self.n_layer):
+ empty = tf.zeros([self.mem_len, bsz, self.d_model])
+ mems.append(empty)
+
+ return mems
+ else:
+ return None
+
+ def _update_mems(self, hids, mems, mlen, qlen):
+ # does not deal with None
+ if mems is None:
+ return None
+
+ # mems is not None
+ assert len(hids) == len(mems), "len(hids) != len(mems)"
+
+ # There are `mlen + qlen` steps that can be cached into mems
+ new_mems = []
+ end_idx = mlen + tf.math.maximum(0, qlen)
+ beg_idx = tf.math.maximum(0, end_idx - tf.convert_to_tensor(self.mem_len))
+ for i in range(len(hids)):
+ mems[i] = tf.cast(mems[i], dtype=hids[i].dtype)
+ cat = tf.concat([mems[i], hids[i]], axis=0)
+ tf.stop_gradient(cat)
+ new_mems.append(cat[beg_idx:end_idx])
+
+ return new_mems
+
+ @unpack_inputs
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ mems: list[tf.Tensor] | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: bool | None = None,
+ output_hidden_states: bool | None = None,
+ return_dict: bool | None = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: bool = False,
+ ):
+ # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
+ # so we transpose here from shape [bsz, len] to shape [len, bsz]
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_ids = tf.transpose(input_ids, perm=(1, 0))
+ qlen, bsz = shape_list(input_ids)
+ elif inputs_embeds is not None:
+ inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
+ qlen, bsz = shape_list(inputs_embeds)[:2]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if mems is None:
+ mems = self.init_mems(bsz)
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
+ # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
+ if head_mask is not None:
+ raise NotImplementedError
+ else:
+ head_mask = [None] * self.n_layer
+
+ if inputs_embeds is not None:
+ word_emb = inputs_embeds
+ else:
+ word_emb = self.word_emb(input_ids)
+
+ mlen = shape_list(mems[0])[0] if mems is not None else 0
+ klen = mlen + qlen
+
+ # Compute decoder attention mask
+ all_ones = tf.ones([qlen, klen], dtype=tf.int32)
+ upper_mask = 1 - tf.linalg.band_part(tf.ones([qlen, klen], dtype=tf.int32), -1, mlen)
+ if self.same_length:
+ mask_len = klen - self.mem_len
+ mask_shift_len = qlen - tf.nn.relu(mask_len) # Lazy clamping of negatives to zero
+
+ # Use an indicator variable instead of a conditional to keep the compiler happy
+ lower_mask = tf.linalg.band_part(all_ones, -1, 0) - (
+ tf.linalg.band_part(all_ones, mask_shift_len - 1, 0) * tf.cast(mask_shift_len != 0, tf.int32)
+ )
+ dec_attn_mask = upper_mask + lower_mask
+ else:
+ dec_attn_mask = upper_mask
+
+ hids = []
+ attentions = [] if output_attentions else None
+ if self.attn_type == 0: # default
+ pos_seq = tf.range(klen - 1, -1, -1.0)
+ if self.clamp_len > 0:
+ pos_seq = tf.minimum(pos_seq, self.clamp_len)
+ pos_emb = self.pos_emb(pos_seq)
+
+ core_out = self.drop(word_emb, training=training)
+ pos_emb = self.drop(pos_emb, training=training)
+
+ for i, layer in enumerate(self.layers):
+ hids.append(core_out)
+ mems_i = None if mems is None else mems[i]
+ layer_outputs = layer(
+ core_out,
+ pos_emb,
+ dec_attn_mask,
+ mems_i,
+ head_mask[i],
+ output_attentions,
+ training=training,
+ )
+ core_out = layer_outputs[0]
+ if output_attentions:
+ attentions.append(layer_outputs[1])
+ else: # learnable embeddings and absolute embeddings
+ raise NotImplementedError # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+ core_out = self.drop(core_out, training=training)
+
+ new_mems = self._update_mems(hids, mems, mlen, qlen)
+
+ # We transpose back here to shape [bsz, len, hidden_dim]
+ core_out = tf.transpose(core_out, perm=(1, 0, 2))
+
+ if output_hidden_states:
+ # Transpose to library standard shape [bsz, len, hidden_dim] and add last layer
+ hids = tuple(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
+ hids = hids + (core_out,)
+ else:
+ hids = None
+ if output_attentions:
+ # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
+ attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
+
+ if not return_dict:
+ return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
+
+ return TFTransfoXLModelOutput(
+ last_hidden_state=core_out,
+ mems=new_mems,
+ hidden_states=hids,
+ attentions=attentions,
+ )
+
+
+class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config_class = TransfoXLConfig
+ base_model_prefix = "transformer"
+
+
+@dataclass
+class TFTransfoXLModelOutput(ModelOutput):
+ """
+ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+ Args:
+ last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ mems (`list[tf.Tensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+ input) to speed up sequential decoding. The token ids which have their past given to this model should not
+ be passed as input ids as they have already been computed.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ last_hidden_state: tf.Tensor | None = None
+    mems: list[tf.Tensor] | None = None
+ hidden_states: tuple[tf.Tensor] | None = None
+ attentions: tuple[tf.Tensor] | None = None
+
+
+@dataclass
+class TFTransfoXLLMHeadModelOutput(ModelOutput):
+ """
+ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+ Args:
+ losses (`tf.Tensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided):
+ Language modeling losses (not reduced).
+ prediction_scores (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
+ mems (`list[tf.Tensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+ input) to speed up sequential decoding. The token ids which have their past given to this model should not
+ be passed as input ids as they have already been computed.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ prediction_scores: tf.Tensor | None = None
+    mems: list[tf.Tensor] | None = None
+ hidden_states: tuple[tf.Tensor] | None = None
+ attentions: tuple[tf.Tensor] | None = None
+
+
+@dataclass
+class TFTransfoXLSequenceClassifierOutputWithPast(ModelOutput):
+ """
+ Base class for outputs of sentence classification models.
+
+ Args:
+ loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ mems (`list[tf.Tensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+ input) to speed up sequential decoding. The token ids which have their past given to this model should not
+ be passed as input ids as they have already been computed.
+ hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+ `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: tf.Tensor | None = None
+ logits: tf.Tensor | None = None
+    mems: list[tf.Tensor] | None = None
+ hidden_states: tuple[tf.Tensor] | None = None
+ attentions: tuple[tf.Tensor] | None = None
+
+
+TRANSFO_XL_START_DOCSTRING = r"""
+
+ This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matters related to general usage and
+ behavior.
+
+
+
+ TensorFlow models and layers in `transformers` accept two formats as input:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional argument.
+
+ The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+ and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+ pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+ format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+ the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+ positional argument:
+
+ - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ Note that when creating models and layers with
+ [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+ about any of this, as you can just pass inputs like you would to any other Python function!
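+
+    As an illustration only (a minimal sketch assuming a constructed `model` and an
+    `input_ids` tensor), all of the formats above invoke the same forward pass:
+
+    ```python
+    >>> outputs = model(input_ids)  # a single tensor
+    >>> outputs = model([input_ids])  # a list in the documented argument order
+    >>> outputs = model({"input_ids": input_ids})  # a dict keyed by input name
+    ```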
+
+
+
+ Parameters:
+ config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+ [`PreTrainedTokenizer.encode`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ mems (`list[tf.Tensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+ `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
+            given to this model should not be passed as `input_ids` as they have already been computed (a usage
+            sketch appears at the end of this docstring).
+ head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+ inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+ config will be used instead.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+ eager mode, in graph mode the value will always be set to True.
+ training (`bool`, *optional*, defaults to `False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
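+
+    For illustration only (a sketch assuming `model` and two consecutive batches of token
+    ids `chunk_0` and `chunk_1`), the `mems` cache is threaded through successive calls:
+
+    ```python
+    >>> out = model(chunk_0)  # first segment; memories are initialised internally
+    >>> out = model(chunk_1, mems=out.mems)  # pass only the new tokens plus the cached states
+    ```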
+"""
+
+
+@add_start_docstrings(
+ "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
+ TRANSFO_XL_START_DOCSTRING,
+)
+class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.transformer = TFTransfoXLMainLayer(config, name="transformer")
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFTransfoXLModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ mems: list[tf.Tensor] | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: bool | None = None,
+ output_hidden_states: bool | None = None,
+ return_dict: bool | None = None,
+ training: bool = False,
+ ) -> TFTransfoXLModelOutput | tuple[tf.Tensor]:
+ outputs = self.transformer(
+ input_ids=input_ids,
+ mems=mems,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ return outputs
+
+
+@add_start_docstrings(
+ """
+ The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
+ input embeddings)
+ """,
+ TRANSFO_XL_START_DOCSTRING,
+)
+class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.transformer = TFTransfoXLMainLayer(config, name="transformer")
+ self.sample_softmax = config.sample_softmax
+ assert self.sample_softmax <= 0, (
+ "Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
+ " https://github.com/huggingface/transformers/issues/3310"
+ )
+
+ self.crit = TFAdaptiveSoftmaxMask(
+ config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit"
+ )
+
+ def _resize_token_embeddings(self, new_num_tokens):
+ raise NotImplementedError()
+
+ def get_output_embeddings(self):
+ """Double-check if you are using adaptive softmax."""
+ if len(self.crit.out_layers) > 0:
+ return self.crit.out_layers[-1]
+ return None
+
+ def reset_memory_length(self, mem_len):
+ self.transformer.reset_memory_length(mem_len)
+
+ def init_mems(self, bsz):
+ return self.transformer.init_mems(bsz)
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFTransfoXLLMHeadModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ mems: list[tf.Tensor] | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: bool | None = None,
+ output_hidden_states: bool | None = None,
+ return_dict: bool | None = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: bool = False,
+ ) -> TFTransfoXLLMHeadModelOutput | tuple[tf.Tensor]:
+ if input_ids is not None:
+ bsz, tgt_len = shape_list(input_ids)[:2]
+ else:
+ bsz, tgt_len = shape_list(inputs_embeds)[:2]
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ mems,
+ head_mask,
+ inputs_embeds,
+ output_attentions,
+ output_hidden_states,
+ return_dict,
+ training=training,
+ )
+
+ last_hidden = transformer_outputs[0]
+ pred_hid = last_hidden[:, -tgt_len:]
+
+ softmax_output = self.crit(pred_hid, labels, training=training)
+ prediction_scores = softmax_output if labels is None else ()
+
+ if not return_dict:
+ return (prediction_scores,) + transformer_outputs[1:]
+
+ return TFTransfoXLLMHeadModelOutput(
+ prediction_scores=prediction_scores,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
+        # if past key values are provided, only the last token is needed for decoding
+        if past_key_values:
+            input_ids = tf.expand_dims(input_ids[:, -1], axis=-1)
+
+        return {"input_ids": input_ids}
+
+ # Adapted from the torch tie_weights function
+ def tf_to_pt_weight_rename(self, tf_weight):
+ if self.config.tie_word_embeddings and "crit.out_layers" in tf_weight:
+ return tf_weight, tf_weight.replace("crit.out_layers", "transformer.word_emb.emb_layers")
+ elif self.config.tie_projs and "crit.out_projs" in tf_weight:
+ for i, tie_proj in enumerate(self.config.tie_projs):
+ if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
+ # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+ return tf_weight, tf_weight.replace(f"crit.out_projs.{i}", "transformer.word_emb.emb_projs.0")
+ elif tie_proj and self.config.div_val != 1:
+ # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+ return tf_weight, tf_weight.replace("crit.out_projs", "transformer.word_emb.emb_projs")
+ else:
+ return (tf_weight,)
+
+
+@add_start_docstrings(
+ """
+ The Transfo XL Model transformer with a sequence classification head on top (linear layer).
+
+ [`TFTransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
+    models (e.g. GPT-1, GPT-2) do.
+
+    Since it does classification on the last token, it needs to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ TRANSFO_XL_START_DOCSTRING,
+)
+class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenceClassificationLoss):
+ def __init__(self, config, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+ self.num_labels = config.num_labels
+ self.score = keras.layers.Dense(
+ config.num_labels,
+ kernel_initializer=get_initializer(config.init_range),
+ name="score",
+ use_bias=False,
+ )
+ self.transformer = TFTransfoXLMainLayer(config, name="transformer")
+
+ def get_output_embeddings(self):
+ # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too.
+ logger.warning(
+ "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
+ "in transformers v4.32."
+ )
+ return self.transformer.word_emb
+
+ @unpack_inputs
+ @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFTransfoXLSequenceClassifierOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: TFModelInputType | None = None,
+ mems: list[tf.Tensor] | None = None,
+ head_mask: np.ndarray | tf.Tensor | None = None,
+ inputs_embeds: np.ndarray | tf.Tensor | None = None,
+ output_attentions: bool | None = None,
+ output_hidden_states: bool | None = None,
+ return_dict: bool | None = None,
+ labels: np.ndarray | tf.Tensor | None = None,
+ training: bool | None = False,
+ ) -> tuple | TFTransfoXLSequenceClassifierOutputWithPast:
+ r"""
+ labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`.
+ """
+ transformer_outputs = self.transformer(
+ input_ids=input_ids,
+ mems=mems,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ )
+
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+ in_logits = None
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ sequence_lengths = (
+ tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
+ - 1
+ )
+ sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
+ in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
+ else:
+ sequence_lengths = -1
+ logger.warning_once(
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+ )
+ loss = None
+
+ if labels is not None:
+ if input_ids is not None:
+ batch_size, sequence_length = shape_list(input_ids)[:2]
+ else:
+ batch_size, sequence_length = shape_list(inputs_embeds)[:2]
+ assert self.config.pad_token_id is not None or batch_size == 1, (
+ "Cannot handle batch sizes > 1 if no padding token is defined."
+ )
+
+ if not tf.is_tensor(sequence_lengths):
+ in_logits = logits[0:batch_size, sequence_lengths]
+
+ loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))
+
+ pooled_logits = in_logits if in_logits is not None else logits
+
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFTransfoXLSequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+
+__all__ = [
+ "TFAdaptiveEmbedding",
+ "TFTransfoXLForSequenceClassification",
+ "TFTransfoXLLMHeadModel",
+ "TFTransfoXLMainLayer",
+ "TFTransfoXLModel",
+ "TFTransfoXLPreTrainedModel",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..48205e06fb20a473959544db4971dff0d3e58cbf
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
@@ -0,0 +1,178 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A TF 2.0 adaptive softmax for the Transformer-XL model.
+"""
+
+import tensorflow as tf
+
+from ....modeling_tf_utils import keras
+from ....tf_utils import shape_list
+
+
+class TFAdaptiveSoftmaxMask(keras.layers.Layer):
+ def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
+ super().__init__(**kwargs)
+
+ self.vocab_size = vocab_size
+ self.d_embed = d_embed
+ self.d_proj = d_proj
+
+ self.cutoffs = cutoffs + [vocab_size]
+ self.cutoff_ends = [0] + self.cutoffs
+ self.div_val = div_val
+
+ self.shortlist_size = self.cutoffs[0]
+ self.n_clusters = len(self.cutoffs) - 1
+ self.head_size = self.shortlist_size + self.n_clusters
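+ # the "head" softmax covers the shortlist tokens plus one pseudo-token per tail cluster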
+ self.keep_order = keep_order
+
+ self.out_layers = []
+ self.out_projs = []
+
+ def build(self, input_shape):
+ if self.n_clusters > 0:
+ self.cluster_weight = self.add_weight(
+ shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight"
+ )
+ self.cluster_bias = self.add_weight(
+ shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias"
+ )
+
+ if self.div_val == 1:
+ for i in range(len(self.cutoffs)):
+ if self.d_proj != self.d_embed:
+ weight = self.add_weight(
+ shape=(self.d_embed, self.d_proj),
+ initializer="zeros",
+ trainable=True,
+ name=f"out_projs_._{i}",
+ )
+ self.out_projs.append(weight)
+ else:
+ self.out_projs.append(None)
+ weight = self.add_weight(
+ shape=(self.vocab_size, self.d_embed),
+ initializer="zeros",
+ trainable=True,
+ name=f"out_layers_._{i}_._weight",
+ )
+ bias = self.add_weight(
+ shape=(self.vocab_size,),
+ initializer="zeros",
+ trainable=True,
+ name=f"out_layers_._{i}_._bias",
+ )
+ self.out_layers.append((weight, bias))
+ else:
+ for i in range(len(self.cutoffs)):
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+ d_emb_i = self.d_embed // (self.div_val**i)
+
+ weight = self.add_weight(
+ shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name=f"out_projs_._{i}"
+ )
+ self.out_projs.append(weight)
+ weight = self.add_weight(
+ shape=(r_idx - l_idx, d_emb_i),
+ initializer="zeros",
+ trainable=True,
+ name=f"out_layers_._{i}_._weight",
+ )
+ bias = self.add_weight(
+ shape=(r_idx - l_idx,),
+ initializer="zeros",
+ trainable=True,
+ name=f"out_layers_._{i}_._bias",
+ )
+ self.out_layers.append((weight, bias))
+ super().build(input_shape)
+
+ @staticmethod
+ def _logit(x, W, b, proj=None):
+ y = x
+ if proj is not None:
+ y = tf.einsum("ibd,ed->ibe", y, proj)
+ return tf.einsum("ibd,nd->ibn", y, W) + b
+
+ @staticmethod
+ def _gather_logprob(logprob, target):
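+ # Pick each row's target log-probability by building (row, target) index pairs for gather_nd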
+ lp_size = shape_list(logprob)
+ r = tf.range(lp_size[0], dtype=target.dtype)
+ idx = tf.stack([r, target], 1)
+ return tf.gather_nd(logprob, idx)
+
+ def call(self, hidden, target, return_mean=True, training=False):
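+ # Adaptive softmax: a small "head" softmax scores the frequent (shortlist) tokens plus one
+ # entry per tail cluster; a rare token's log-probability is its cluster's head log-probability
+ # plus its log-probability inside that cluster's tail softmax.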
+ head_logprob = 0
+ if self.n_clusters == 0:
+ output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
+ if target is not None:
+ loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
+ out = tf.nn.log_softmax(output, axis=-1)
+ else:
+ hidden_sizes = shape_list(hidden)
+ out = []
+ loss = tf.zeros(hidden_sizes[:2])
+ for i in range(len(self.cutoffs)):
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+ if target is not None:
+ mask = (target >= l_idx) & (target < r_idx)
+ mask_idx = tf.where(mask)
+ cur_target = tf.boolean_mask(target, mask) - l_idx
+
+ if self.div_val == 1:
+ cur_W = self.out_layers[0][0][l_idx:r_idx]
+ cur_b = self.out_layers[0][1][l_idx:r_idx]
+ else:
+ cur_W = self.out_layers[i][0]
+ cur_b = self.out_layers[i][1]
+
+ if i == 0:
+ cur_W = tf.concat([cur_W, self.cluster_weight], 0)
+ cur_b = tf.concat([cur_b, self.cluster_bias], 0)
+
+ head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0])
+ head_logprob = tf.nn.log_softmax(head_logit)
+ out.append(head_logprob[..., : self.cutoffs[0]])
+ if target is not None:
+ cur_head_logprob = tf.boolean_mask(head_logprob, mask)
+ cur_logprob = self._gather_logprob(cur_head_logprob, cur_target)
+ else:
+ tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i])
+ tail_logprob = tf.nn.log_softmax(tail_logit)
+ cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster
+ logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob
+ out.append(logprob_i)
+ if target is not None:
+ cur_head_logprob = tf.boolean_mask(head_logprob, mask)
+ cur_tail_logprob = tf.boolean_mask(tail_logprob, mask)
+ cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
+ cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
+ if target is not None:
+ loss += tf.scatter_nd(mask_idx, -cur_logprob, shape_list(loss))
+ out = tf.concat(out, axis=-1)
+
+ if target is not None:
+ if return_mean:
+ loss = tf.reduce_mean(loss)
+ # Add the training-time loss value to the layer using `self.add_loss()`.
+ self.add_loss(loss)
+
+ # Log the loss as a metric (we could log arbitrary metrics,
+ # including different metrics for training and inference).
+ self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "")
+
+ return out
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..49d07391320d0fbc93c969cc72eb27b3e5e4ed0a
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
@@ -0,0 +1,1303 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular
+https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
+"""
+
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+)
+from .configuration_transfo_xl import TransfoXLConfig
+from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
+_CONFIG_FOR_DOC = "TransfoXLConfig"
+
+
+def build_tf_to_pytorch_map(model, config):
+ """
+ A map of modules from TF to PyTorch. A map is used here to keep the PyTorch model as close as possible to the
+ original PyTorch implementation.
+ """
+ tf_to_pt_map = {}
+
+ if hasattr(model, "transformer"):
+ # We are loading in a TransfoXLLMHeadModel => we will also load the Adaptive Softmax
+ tf_to_pt_map.update(
+ {
+ "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
+ "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias,
+ }
+ )
+ for i, (out_l, proj_l, tie_proj) in enumerate(
+ zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
+ ):
+ layer_str = f"transformer/adaptive_softmax/cutoff_{i}/"
+ if config.tie_word_embeddings:
+ tf_to_pt_map.update({layer_str + "b": out_l.bias})
+ else:
+ raise NotImplementedError
+ # I don't think this is implemented in the TF code
+ tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias})
+ if not tie_proj:
+ tf_to_pt_map.update({layer_str + "proj": proj_l})
+ # Now load the rest of the transformer
+ model = model.transformer
+
+ # Embeddings
+ for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
+ layer_str = f"transformer/adaptive_embed/cutoff_{i}/"
+ tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})
+
+ # Transformer blocks
+ for i, b in enumerate(model.layers):
+ layer_str = f"transformer/layer_{i}/"
+ tf_to_pt_map.update(
+ {
+ layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
+ layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
+ layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
+ layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
+ layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
+ layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
+ layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
+ layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
+ layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
+ layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
+ layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
+ }
+ )
+
+ # Relative positioning biases
+ if config.untie_r:
+ r_r_list = []
+ r_w_list = []
+ for b in model.layers:
+ r_r_list.append(b.dec_attn.r_r_bias)
+ r_w_list.append(b.dec_attn.r_w_bias)
+ else:
+ r_r_list = [model.r_r_bias]
+ r_w_list = [model.r_w_bias]
+ tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list})
+ return tf_to_pt_map
+
+
+def load_tf_weights_in_transfo_xl(model, config, tf_path):
+ """Load tf checkpoints in a pytorch model"""
+ try:
+ import numpy as np
+ import tensorflow as tf
+ except ImportError:
+ logger.error(
+ "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+ "https://www.tensorflow.org/install/ for installation instructions."
+ )
+ raise
+ # Build TF to PyTorch weights loading map
+ tf_to_pt_map = build_tf_to_pytorch_map(model, config)
+
+ # Load weights from TF model
+ init_vars = tf.train.list_variables(tf_path)
+ tf_weights = {}
+ for name, shape in init_vars:
+ logger.info(f"Loading TF weight {name} with shape {shape}")
+ array = tf.train.load_variable(tf_path, name)
+ tf_weights[name] = array
+
+ for name, pointer in tf_to_pt_map.items():
+ assert name in tf_weights
+ array = tf_weights[name]
+ # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculate m and v
+ # which are not required for using pretrained model
+ if "kernel" in name or "proj" in name:
+ array = np.transpose(array)
+ if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1:
+ # Here we will split the TF weights
+ assert len(pointer) == array.shape[0]
+ for i, p_i in enumerate(pointer):
+ arr_i = array[i, ...]
+ try:
+ assert p_i.shape == arr_i.shape
+ except AssertionError as e:
+ e.args += (p_i.shape, arr_i.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name} for layer {i}")
+ p_i.data = torch.from_numpy(arr_i)
+ else:
+ try:
+ assert pointer.shape == array.shape, (
+ f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+ )
+ except AssertionError as e:
+ e.args += (pointer.shape, array.shape)
+ raise
+ logger.info(f"Initialize PyTorch weight {name}")
+ pointer.data = torch.from_numpy(array)
+ tf_weights.pop(name, None)
+ tf_weights.pop(name + "/Adam", None)
+ tf_weights.pop(name + "/Adam_1", None)
+
+ logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
+ return model
+
+
+class PositionalEmbedding(nn.Module):
+ def __init__(self, demb):
+ super().__init__()
+
+ self.demb = demb
+
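+ # sinusoidal inverse frequencies 1 / 10000^(2i / d), as in "Attention Is All You Need"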
+ inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
+ self.register_buffer("inv_freq", inv_freq)
+
+ def forward(self, pos_seq, bsz=None):
+ sinusoid_inp = torch.outer(pos_seq, self.inv_freq)
+ pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
+
+ if bsz is not None:
+ return pos_emb[:, None, :].expand(-1, bsz, -1)
+ else:
+ return pos_emb[:, None, :]
+
+
+class PositionwiseFF(nn.Module):
+ def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
+ super().__init__()
+
+ self.d_model = d_model
+ self.d_inner = d_inner
+ self.dropout = dropout
+
+ self.CoreNet = nn.Sequential(
+ nn.Linear(d_model, d_inner),
+ nn.ReLU(inplace=True),
+ nn.Dropout(dropout),
+ nn.Linear(d_inner, d_model),
+ nn.Dropout(dropout),
+ )
+
+ self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
+
+ self.pre_lnorm = pre_lnorm
+
+ def forward(self, inp):
+ if self.pre_lnorm:
+ # layer normalization + positionwise feed-forward
+ core_out = self.CoreNet(self.layer_norm(inp))
+
+ # residual connection
+ output = core_out + inp
+ else:
+ # positionwise feed-forward
+ core_out = self.CoreNet(inp)
+
+ # residual connection + layer normalization
+ output = self.layer_norm(inp + core_out)
+
+ return output
+
+
+class RelPartialLearnableMultiHeadAttn(nn.Module):
+ def __init__(
+ self,
+ n_head,
+ d_model,
+ d_head,
+ dropout,
+ dropatt=0,
+ pre_lnorm=False,
+ r_r_bias=None,
+ r_w_bias=None,
+ layer_norm_epsilon=1e-5,
+ ):
+ super().__init__()
+
+ self.n_head = n_head
+ self.d_model = d_model
+ self.d_head = d_head
+ self.dropout = dropout
+
+ self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)
+
+ self.drop = nn.Dropout(dropout)
+ self.dropatt = nn.Dropout(dropatt)
+ self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
+
+ self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
+
+ self.scale = 1 / (d_head**0.5)
+
+ self.pre_lnorm = pre_lnorm
+
+ if r_r_bias is None or r_w_bias is None: # Biases are not shared
+ self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+ self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+ else:
+ self.r_r_bias = r_r_bias
+ self.r_w_bias = r_w_bias
+
+ self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
+
+ def _rel_shift(self, x):
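+ # Relative-shift trick from Transformer-XL: prepend a zero column along the key axis,
+ # reshape so each row is offset by one position, then drop the first row. This realigns
+ # the position-indexed scores so entry (i, j) corresponds to relative distance i - j.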
+ zero_pad_shape = (x.size(0), 1) + x.size()[2:]
+ zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
+ x_padded = torch.cat([zero_pad, x], dim=1)
+
+ x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:]
+ x_padded = x_padded.view(*x_padded_shape)
+
+ x = x_padded[1:].view_as(x)
+
+ return x
+
+ def forward(self, w, r, attn_mask=None, mems=None, head_mask=None, output_attentions=False):
+ qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
+
+ if mems is not None:
+ cat = torch.cat([mems, w], 0)
+ if self.pre_lnorm:
+ w_heads = self.qkv_net(self.layer_norm(cat))
+ else:
+ w_heads = self.qkv_net(cat)
+ r_head_k = self.r_net(r)
+
+ w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
+ w_head_q = w_head_q[-qlen:]
+ else:
+ if self.pre_lnorm:
+ w_heads = self.qkv_net(self.layer_norm(w))
+ else:
+ w_heads = self.qkv_net(w)
+ r_head_k = self.r_net(r)
+
+ w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
+
+ klen = w_head_k.size(0)
+
+ w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head) # qlen x bsz x n_head x d_head
+ w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head) # klen x bsz x n_head x d_head
+ w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head) # klen x bsz x n_head x d_head
+
+ r_head_k = r_head_k.view(rlen, self.n_head, self.d_head) # rlen x n_head x d_head
+
+ # compute attention score
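+ # AC is the content-based term ((a) + (c)) and BD the position-based term ((b) + (d))
+ # from the Transformer-XL paper (Dai et al., 2019)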
+ rw_head_q = w_head_q + self.r_w_bias # qlen x bsz x n_head x d_head
+ AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k)) # qlen x klen x bsz x n_head
+
+ rr_head_q = w_head_q + self.r_r_bias
+ BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k)) # qlen x klen x bsz x n_head
+ BD = self._rel_shift(BD)
+
+ # [qlen x klen x bsz x n_head]
+ attn_score = AC + BD
+ attn_score.mul_(self.scale)
+
+ mask_value = torch.finfo(attn_score.dtype).min
+
+ # compute attention probability
+ if attn_mask is not None and torch.sum(attn_mask).item():
+ attn_mask = attn_mask == 1 # Switch to bool
+ if attn_mask.dim() == 2:
+ attn_score = (
+ attn_score.float().masked_fill(attn_mask[None, :, :, None], mask_value).type_as(attn_score)
+ )
+ elif attn_mask.dim() == 3:
+ attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], mask_value).type_as(attn_score)
+
+ # [qlen x klen x bsz x n_head]
+ attn_prob = nn.functional.softmax(attn_score, dim=1)
+ attn_prob = self.dropatt(attn_prob)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attn_prob = attn_prob * head_mask
+
+ # compute attention vector
+ attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v))
+
+ # [qlen x bsz x n_head x d_head]
+ attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
+
+ # linear projection
+ attn_out = self.o_net(attn_vec)
+ attn_out = self.drop(attn_out)
+
+ if self.pre_lnorm:
+ # residual connection
+ outputs = [w + attn_out]
+ else:
+ # residual connection + layer normalization
+ outputs = [self.layer_norm(w + attn_out)]
+
+ if output_attentions:
+ outputs.append(attn_prob)
+
+ return outputs
+
+
+class RelPartialLearnableDecoderLayer(nn.Module):
+ def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
+ super().__init__()
+
+ self.dec_attn = RelPartialLearnableMultiHeadAttn(
+ n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
+ )
+ self.pos_ff = PositionwiseFF(
+ d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon
+ )
+
+ def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False):
+ attn_outputs = self.dec_attn(
+ dec_inp,
+ r,
+ attn_mask=dec_attn_mask,
+ mems=mems,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ )
+ ff_output = self.pos_ff(attn_outputs[0])
+
+ outputs = [ff_output] + attn_outputs[1:]
+
+ return outputs
+
+
+class AdaptiveEmbedding(nn.Module):
+ def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
+ super().__init__()
+
+ self.n_token = n_token
+ self.d_embed = d_embed
+
+ self.cutoffs = cutoffs + [n_token]
+ self.div_val = div_val
+ self.d_proj = d_proj
+
+ self.emb_scale = d_proj**0.5
+
+ self.cutoff_ends = [0] + self.cutoffs
+
+ self.emb_layers = nn.ModuleList()
+ self.emb_projs = nn.ParameterList()
+ if div_val == 1:
+ self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0))
+ if d_proj != d_embed:
+ self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
+ else:
+ for i in range(len(self.cutoffs)):
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+ d_emb_i = d_embed // (div_val**i)
+ self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i))
+ self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
+
+ def forward(self, inp):
+ if self.div_val == 1:
+ embed = self.emb_layers[0](inp)
+ if self.d_proj != self.d_embed:
+ embed = nn.functional.linear(embed, self.emb_projs[0])
+ else:
+ param = next(self.parameters())
+ inp_flat = inp.view(-1)
+ emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device)
+ for i in range(len(self.cutoffs)):
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+
+ mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
+ indices_i = mask_i.nonzero().squeeze()
+
+ if indices_i.numel() == 0:
+ continue
+
+ inp_i = inp_flat.index_select(0, indices_i) - l_idx
+ emb_i = self.emb_layers[i](inp_i)
+ emb_i = nn.functional.linear(emb_i, self.emb_projs[i])
+
+ emb_flat.index_copy_(0, indices_i, emb_i)
+
+ embed_shape = inp.size() + (self.d_proj,)
+ embed = emb_flat.view(embed_shape)
+
+ embed.mul_(self.emb_scale)
+
+ return embed
+
+
+class TransfoXLPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: TransfoXLConfig
+ load_tf_weights = load_tf_weights_in_transfo_xl
+ base_model_prefix = "transformer"
+
+ def _init_weight(self, weight):
+ if self.config.init == "uniform":
+ nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
+ elif self.config.init == "normal":
+ nn.init.normal_(weight, 0.0, self.config.init_std)
+
+ def _init_bias(self, bias):
+ nn.init.constant_(bias, 0.0)
+
+ def _init_weights(self, m):
+ """Initialize the weights."""
+ classname = m.__class__.__name__
+ if classname.find("Linear") != -1:
+ if hasattr(m, "weight") and m.weight is not None:
+ self._init_weight(m.weight)
+ if hasattr(m, "bias") and m.bias is not None:
+ self._init_bias(m.bias)
+ elif classname.find("AdaptiveEmbedding") != -1:
+ if hasattr(m, "emb_projs"):
+ for i in range(len(m.emb_projs)):
+ if m.emb_projs[i] is not None:
+ nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
+ elif classname.find("Embedding") != -1:
+ if hasattr(m, "weight"):
+ self._init_weight(m.weight)
+ elif classname.find("ProjectedAdaptiveLogSoftmax") != -1:
+ if hasattr(m, "cluster_weight") and m.cluster_weight is not None:
+ self._init_weight(m.cluster_weight)
+ if hasattr(m, "cluster_bias") and m.cluster_bias is not None:
+ self._init_bias(m.cluster_bias)
+ if hasattr(m, "out_projs"):
+ for i in range(len(m.out_projs)):
+ if m.out_projs[i] is not None:
+ nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
+ elif classname.find("LayerNorm") != -1:
+ if hasattr(m, "weight"):
+ nn.init.normal_(m.weight, 1.0, self.config.init_std)
+ if hasattr(m, "bias") and m.bias is not None:
+ self._init_bias(m.bias)
+ else:
+ if hasattr(m, "r_emb"):
+ self._init_weight(m.r_emb)
+ if hasattr(m, "r_w_bias"):
+ self._init_weight(m.r_w_bias)
+ if hasattr(m, "r_r_bias"):
+ self._init_weight(m.r_r_bias)
+ if hasattr(m, "r_bias"):
+ self._init_bias(m.r_bias)
+
+ def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1):
+ """
+ Resize the input token embeddings matrix of the model if `new_num_tokens != config.vocab_size`. Takes care of
+ tying weights embeddings afterwards if the model class has a *tie_weights()* method.
+
+ Arguments:
+ new_num_tokens: (*optional*) int:
+ New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at
+ the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and
+ just returns a pointer to the input tokens `torch.nn.Embedding` Module of the model.
+ layer: (*optional*) int:
+ Layer of the *AdaptiveEmbedding* where the resizing should be done. By default, the last layer will be
+ resized. Be aware that when resizing other than the last layer, you have to ensure that the new
+ token(s) in the tokenizer are at the corresponding position.
+
+ Return: `torch.nn.Embedding`: pointer to the input tokens embedding Module of the model
+ """
+ base_model = getattr(self, self.base_model_prefix, self) # get the base model if needed
+
+ if new_num_tokens is None:
+ return self.get_input_embeddings()
+
+ new_num_tokens_layer, layer = self._get_new_num_tokens_layer(new_num_tokens, layer)
+ assert new_num_tokens_layer > 0, "The size of the new embedding layer cannot be 0 or less"
+ model_embeds = base_model._resize_token_embeddings(new_num_tokens_layer, layer)
+
+ # Update base model and current model config
+ self.config.vocab_size = new_num_tokens
+ base_model.vocab_size = new_num_tokens
+ base_model.n_token = new_num_tokens
+
+ new_embedding_shapes = self._get_embedding_shapes()
+ self._resize_cutoffs(new_num_tokens, new_num_tokens_layer, new_embedding_shapes, layer)
+
+ # Tie weights again if needed
+ self.tie_weights()
+
+ return model_embeds
+
+ def _get_new_num_tokens_layer(self, new_num_tokens, layer):
+ embeddings = self.get_input_embeddings()
+ if layer == -1:
+ layer = len(embeddings.emb_layers) - 1
+ assert 0 <= layer <= len(embeddings.emb_layers) - 1
+
+ new_num_tokens_layer = (
+ new_num_tokens
+ - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[:layer])
+ - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1 :])
+ )
+ return new_num_tokens_layer, layer
+
+ def _get_embedding_shapes(self):
+ embeddings = self.get_input_embeddings()
+ return [emb.weight.shape[0] for emb in embeddings.emb_layers]
+
+ def _resize_token_embeddings(self, new_num_tokens, layer=-1):
+ embeddings = self.get_input_embeddings()
+ if new_num_tokens is None:
+ return embeddings
+ new_embeddings_layer = self._get_resized_embeddings(embeddings.emb_layers[layer], new_num_tokens)
+ embeddings.emb_layers[layer] = new_embeddings_layer
+
+ self.set_input_embeddings(embeddings)
+
+ return self.get_input_embeddings()
+
+ def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
+ embeddings = self.get_input_embeddings()
+
+ for i in range(layer, len(embeddings.cutoffs)):
+ embeddings.cutoffs[i] = sum(new_embedding_shapes[: i + 1])
+
+ embeddings.cutoff_ends = [0] + embeddings.cutoffs
+ embeddings.n_token = new_num_tokens
+
+ self.config.cutoffs = embeddings.cutoffs[:-1]
+
+ return embeddings.cutoffs
+
+
+@dataclass
+class TransfoXLModelOutput(ModelOutput):
+ """
+ Base class for model outputs that may also contain past key/value states (to speed up sequential decoding).
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+ input) to speed up sequential decoding. The token ids which have their past given to this model should not
+ be passed as input ids as they have already been computed.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ last_hidden_state: torch.FloatTensor
+ mems: Optional[list[torch.FloatTensor]] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class TransfoXLSequenceClassifierOutputWithPast(ModelOutput):
+ """
+ Base class for outputs of sentence classification models.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Classification (or regression if config.num_labels==1) loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+ Classification (or regression if config.num_labels==1) scores (before SoftMax).
+ mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+ input) to speed up sequential decoding. The token ids which have their past given to this model should not
+ be passed as input ids as they have already been computed.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: Optional[torch.FloatTensor] = None
+ mems: Optional[list[torch.FloatTensor]] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class TransfoXLLMHeadModelOutput(ModelOutput):
+ """
+ Base class for model outputs that may also contain past key/value states (to speed up sequential decoding).
+
+ Args:
+ losses (`torch.FloatTensor` of shape `(batch_size, sequence_length-1)`, *optional*, returned when `labels` is provided):
+ Language modeling losses (not reduced).
+ prediction_scores (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+ Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
+ mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+ input) to speed up sequential decoding. The token ids which have their past given to this model should not
+ be passed as input ids as they have already been computed.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`.
+
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`.
+
+ Attention weights after the attention softmax, used to compute the weighted average in the self-attention
+ heads.
+ loss (`torch.FloatTensor` of shape `()`, *optional*, returned when `labels` is provided):
+ Reduced language modeling loss.
+ """
+
+ losses: Optional[torch.FloatTensor] = None
+ prediction_scores: Optional[torch.FloatTensor] = None
+ mems: Optional[list[torch.FloatTensor]] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+ loss: Optional[torch.FloatTensor] = None
+
+ @property
+ def logits(self):
+ # prediction scores are the output of the adaptive softmax, see
+ # the file `modeling_transfo_xl_utilities`. Since the adaptive
+ # softmax returns the log softmax value, `self.prediction_scores`
+ # are strictly speaking not exactly `logits`, but behave the same
+ # way logits do.
+ return self.prediction_scores
+
+
+TRANSFO_XL_START_DOCSTRING = r"""
+
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning
+ heads, etc.)
+
+ This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+ Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
+ and behavior.
+
+ Parameters:
+ config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+ Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+ `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
+ given to this model should not be passed as `input_ids` as they have already been computed.
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+ is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+ model's internal embedding lookup matrix.
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
+ TRANSFO_XL_START_DOCSTRING,
+)
+class TransfoXLModel(TransfoXLPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.n_token = config.vocab_size
+
+ self.d_embed = config.d_embed
+ self.d_model = config.d_model
+ self.n_head = config.n_head
+ self.d_head = config.d_head
+
+ self.word_emb = AdaptiveEmbedding(
+ config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
+ )
+
+ self.drop = nn.Dropout(config.dropout)
+
+ self.n_layer = config.n_layer
+ self.mem_len = config.mem_len
+ self.attn_type = config.attn_type
+
+ if not config.untie_r:
+ self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+ self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+
+ self.layers = nn.ModuleList()
+ if config.attn_type == 0: # the default attention
+ for i in range(config.n_layer):
+ self.layers.append(
+ RelPartialLearnableDecoderLayer(
+ config.n_head,
+ config.d_model,
+ config.d_head,
+ config.d_inner,
+ config.dropout,
+ dropatt=config.dropatt,
+ pre_lnorm=config.pre_lnorm,
+ r_w_bias=None if config.untie_r else self.r_w_bias,
+ r_r_bias=None if config.untie_r else self.r_r_bias,
+ layer_norm_epsilon=config.layer_norm_epsilon,
+ )
+ )
+ else: # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
+ raise NotImplementedError # Removed them to avoid maintaining dead code
+
+ self.same_length = config.same_length
+ self.clamp_len = config.clamp_len
+
+ if self.attn_type == 0: # default attention
+ self.pos_emb = PositionalEmbedding(self.d_model)
+ else: # learnable embeddings and absolute embeddings
+ raise NotImplementedError # Removed these to avoid maintaining dead code; they are not used in our pretrained checkpoints
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.word_emb
+
+ def set_input_embeddings(self, new_embeddings):
+ self.word_emb = new_embeddings
+
+ def backward_compatible(self):
+ self.sample_softmax = -1
+
+ def reset_memory_length(self, mem_len):
+ self.mem_len = mem_len
+
+ def _prune_heads(self, heads):
+ logger.info("Head pruning is not implemented for Transformer-XL model")
+ pass
+
+ def init_mems(self, bsz):
+ if self.mem_len > 0:
+ mems = []
+ param = next(self.parameters())
+ for i in range(self.n_layer):
+ empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device)
+ mems.append(empty)
+
+ return mems
+ else:
+ return None
+
+ def _update_mems(self, hids, mems, mlen, qlen):
+ # does not deal with None
+ if mems is None:
+ return None
+
+ # mems is not None
+ assert len(hids) == len(mems), "len(hids) != len(mems)"
+
+ # There are `mlen + qlen` steps that can be cached into mems
+ with torch.no_grad():
+ new_mems = []
+ end_idx = mlen + max(0, qlen)
+ beg_idx = max(0, end_idx - self.mem_len)
+ for i in range(len(hids)):
+ cat = torch.cat([mems[i], hids[i]], dim=0)
+ new_mems.append(cat[beg_idx:end_idx].detach())
+
+ return new_mems
+
+ @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TransfoXLModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ mems: Optional[list[torch.FloatTensor]] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, TransfoXLModelOutput]:
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
+ # so we transpose here from shape [bsz, len] to shape [len, bsz]
+ if input_ids is not None and inputs_embeds is not None:
+ raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+ elif input_ids is not None:
+ input_ids = input_ids.transpose(0, 1).contiguous()
+ qlen, bsz = input_ids.size()
+ elif inputs_embeds is not None:
+ inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
+ qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ if mems is None:
+ mems = self.init_mems(bsz)
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicate we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
+ # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
+ if head_mask is not None:
+ if head_mask.dim() == 1:
+ head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
+ head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
+ elif head_mask.dim() == 2:
+ head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
+ head_mask = head_mask.to(
+ dtype=next(self.parameters()).dtype
+ ) # switch to float if need + fp16 compatibility
+ else:
+ head_mask = [None] * self.n_layer
+
+ if inputs_embeds is not None:
+ word_emb = inputs_embeds
+ else:
+ word_emb = self.word_emb(input_ids)
+
+ mlen = mems[0].size(0) if mems is not None else 0
+ klen = mlen + qlen
+ if self.same_length:
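+ # With same_length, every query attends over a window of the same size: the upper
+ # triangle masks future positions and the lower triangle trims cached positions
+ # beyond the fixed attention window.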
+ all_ones = word_emb.new_ones((qlen, klen), dtype=torch.bool)
+ mask_len = klen - self.mem_len
+ if mask_len > 0:
+ mask_shift_len = qlen - mask_len
+ else:
+ mask_shift_len = qlen
+ dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None] # -1
+ else:
+ dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.bool), diagonal=1 + mlen)[
+ :, :, None
+ ]
+
+ hids = []
+ attentions = [] if output_attentions else None
+ if self.attn_type == 0: # default
+ # `Tensor.type_as` expects a tensor, not a `dtype` keyword; cast with `.to` instead
+ pos_seq = torch.arange(klen - 1, -1, -1, device=word_emb.device, dtype=torch.int64).to(
+ word_emb.dtype
+ )
+ if self.clamp_len > 0:
+ pos_seq.clamp_(max=self.clamp_len)
+ pos_emb = self.pos_emb(pos_seq)
+
+ core_out = self.drop(word_emb)
+ pos_emb = self.drop(pos_emb)
+
+ for i, layer in enumerate(self.layers):
+ hids.append(core_out)
+ mems_i = None if mems is None else mems[i]
+ layer_outputs = layer(
+ core_out,
+ pos_emb,
+ dec_attn_mask=dec_attn_mask,
+ mems=mems_i,
+ head_mask=head_mask[i],
+ output_attentions=output_attentions,
+ )
+ core_out = layer_outputs[0]
+ if output_attentions:
+ attentions.append(layer_outputs[1])
+ else: # learnable embeddings and absolute embeddings
+ raise NotImplementedError # Removed these to avoid maintaining dead code; they are not used in our pretrained checkpoints
+
+ core_out = self.drop(core_out)
+
+ new_mems = self._update_mems(hids, mems, mlen, qlen)
+
+ if output_hidden_states:
+ # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
+ hids.append(core_out)
+ hids = tuple(t.transpose(0, 1).contiguous() for t in hids)
+ else:
+ hids = None
+ if output_attentions:
+ # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
+ attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
+ # We transpose back here to shape [bsz, len, hidden_dim]
+ core_out = core_out.transpose(0, 1).contiguous()
+
+ if not return_dict:
+ return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
+
+ return TransfoXLModelOutput(
+ last_hidden_state=core_out,
+ mems=new_mems,
+ hidden_states=hids,
+ attentions=attentions,
+ )
+
+
+@add_start_docstrings(
+ """
+ The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
+ input embeddings)
+ """,
+ TRANSFO_XL_START_DOCSTRING,
+)
+class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
+ _tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]
+
+ def __init__(self, config):
+ super().__init__(config)
+ self.transformer = TransfoXLModel(config)
+ self.sample_softmax = config.sample_softmax
+ self.trainer_compatible = getattr(config, "trainer_compatible", False)
+
+ if not self.trainer_compatible:
+ warnings.warn(
+ "The output of TransfoXL will be updated in v5 to support a single loss as first argument. In order "
+ "to use that updated output, please specify `trainer_compatible=True` as your configuration"
+ " attribute.",
+ DeprecationWarning,
+ )
+
+ assert self.sample_softmax <= 0, (
+ "Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
+ " https://github.com/huggingface/transformers/issues/3310"
+ )
+
+ self.crit = ProjectedAdaptiveLogSoftmax(
+ config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def tie_weights(self):
+ """
+ Run this to be sure output and input (adaptive) softmax weights are tied
+ """
+
+ if self.config.tie_word_embeddings:
+ for i in range(len(self.crit.out_layers)):
+ self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i])
+ if self.config.tie_projs:
+ for i, tie_proj in enumerate(self.config.tie_projs):
+ if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
+ if self.config.torchscript:
+ self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
+ else:
+ self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+ elif tie_proj and self.config.div_val != 1:
+ if self.config.torchscript:
+ self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
+ else:
+ self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+
+ def reset_memory_length(self, mem_len):
+ self.transformer.reset_memory_length(mem_len)
+
+ def init_mems(self, bsz):
+ return self.transformer.init_mems(bsz)
+
+ @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TransfoXLLMHeadModelOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ mems: Optional[list[torch.FloatTensor]] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, TransfoXLLMHeadModelOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+ `labels = input_ids`. Indices are selected in `[-100, 0, ..., config.vocab_size]`. All labels set to `-100`
+ are ignored (masked); the loss is only computed for labels in `[0, ..., config.vocab_size]`.
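+
+ Example (an illustrative sketch; assumes a transformers version that still exports the deprecated
+ Transformer-XL classes and access to the `transfo-xl/transfo-xl-wt103` checkpoint):
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, TransfoXLLMHeadModel
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")
+ >>> model = TransfoXLLMHeadModel.from_pretrained("transfo-xl/transfo-xl-wt103")
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs, labels=inputs["input_ids"])
+ >>> loss, mems = outputs.loss, outputs.mems # mems can be fed back in for the next segment
+ ```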
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ if input_ids is not None:
+ bsz, tgt_len = input_ids.size(0), input_ids.size(1)
+ elif inputs_embeds is not None:
+ bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
+ else:
+ raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ mems=mems,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ last_hidden = transformer_outputs[0]
+ pred_hid = last_hidden[:, -tgt_len:]
+
+ if labels is not None:
+ # Prevent all labels from being -100, which would make the loss NaN
+ # when backpropagating
+ miss_valid_label = labels[0, 1:].sum() == (labels.size(1) - 1) * -100
+ if miss_valid_label:
+ # Set a real token (eos), just to prevent the loss from being NaN
+ labels[0, 1] = self.config.eos_token_id
+
+ softmax_output = self.crit(pred_hid, labels)
+ prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
+
+ if labels is not None:
+ losses = softmax_output.view(bsz, tgt_len - 1)
+ # Avoid incorporating padding (-100) tokens into the loss value
+ loss = losses[losses != 0].mean()
+ else:
+ losses, loss = None, None
+
+ if not return_dict:
+ if self.trainer_compatible:
+ output = (prediction_scores, losses) if losses is not None else (prediction_scores,)
+ output += transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+ else:
+ output = (prediction_scores, *transformer_outputs[1:])
+ output = ((losses,) + output) if losses is not None else output
+ return (output + (loss,)) if loss is not None else output
+
+ return TransfoXLLMHeadModelOutput(
+ loss=loss,
+ prediction_scores=prediction_scores,
+ losses=losses,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+ def get_output_embeddings(self):
+ """Double-check if you are using adaptive softmax."""
+ if self.sample_softmax > 0:
+ return self.out_layer
+ else:
+ return self.crit.out_layers[-1]
+
+ def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
+ inputs = {}
+
+ # if past is defined in model kwargs then use it for faster decoding
+ if past_key_values:
+ inputs["mems"] = past_key_values
+ inputs["input_ids"] = input_ids[:, -1].unsqueeze(-1)
+ else:
+ inputs["input_ids"] = input_ids
+
+ return inputs
+
+ def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
+ new_cutoffs = super()._resize_cutoffs(new_num_tokens, new_emb_size, new_embedding_shapes, layer)
+
+ self.crit.cutoffs = new_cutoffs
+ self.crit.cutoff_ends = [0] + new_cutoffs
+ self.crit.n_token = new_num_tokens
+
+ @staticmethod
+ def _reorder_cache(mems: list[torch.Tensor], beam_idx: torch.Tensor) -> list[torch.Tensor]:
+ """
+ This function is used to re-order the `mems` cache if [`~PreTrainedModel.beam_search`] or
+ [`~PreTrainedModel.beam_sample`] is called. This is required to match `mems` with the correct beam_idx at every
+ generation step.
+ """
+ return [layer_past.index_select(1, beam_idx.to(layer_past.device)) for layer_past in mems]
+
+
+@add_start_docstrings(
+ """
+ The Transformer-XL Model transformer with a sequence classification head on top (linear layer).
+
+ [`TransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
+ models (e.g. GPT-1) do.
+
+ Since it does classification on the last token, it requires to know the position of the last token. If a
+ `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+ no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+ padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+ each row of the batch).
+ """,
+ TRANSFO_XL_START_DOCSTRING,
+)
+class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.num_labels = config.num_labels
+ self.transformer = TransfoXLModel(config)
+ self.score = nn.Linear(config.d_embed, self.num_labels, bias=False)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TransfoXLSequenceClassifierOutputWithPast,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def forward(
+ self,
+ input_ids: Optional[torch.LongTensor] = None,
+ mems: Optional[list[torch.FloatTensor]] = None,
+ head_mask: Optional[torch.FloatTensor] = None,
+ inputs_embeds: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, TransfoXLSequenceClassifierOutputWithPast]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
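+
+ Example (an illustrative sketch; assumes the deprecated Transformer-XL classes are still exported.
+ Note the classification head is newly initialized when loading the plain LM checkpoint):
+
+ ```python
+ >>> import torch
+ >>> from transformers import AutoTokenizer, TransfoXLForSequenceClassification
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")
+ >>> model = TransfoXLForSequenceClassification.from_pretrained("transfo-xl/transfo-xl-wt103")
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> with torch.no_grad():
+ ...     logits = model(**inputs).logits
+ >>> predicted_class_id = int(logits.argmax(dim=-1))
+ ```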
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ transformer_outputs = self.transformer(
+ input_ids,
+ mems=mems,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ hidden_states = transformer_outputs[0]
+ logits = self.score(hidden_states)
+
+ if input_ids is not None:
+ batch_size, sequence_length = input_ids.shape[:2]
+ else:
+ batch_size, sequence_length = inputs_embeds.shape[:2]
+
+ assert self.config.pad_token_id is not None or batch_size == 1, (
+ "Cannot handle batch sizes > 1 if no padding token is defined."
+ )
+ if self.config.pad_token_id is None:
+ sequence_lengths = -1
+ else:
+ if input_ids is not None:
+ # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+ sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+ sequence_lengths = sequence_lengths % input_ids.shape[-1]
+ sequence_lengths = sequence_lengths.to(logits.device)
+ else:
+ sequence_lengths = -1
+ logger.warning_once(
+ f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+ "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+ )
+
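+ # gather each sequence's last (non-padding) token logits for classification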
+ pooled_logits = logits[range(batch_size), sequence_lengths]
+
+ loss = None
+ if labels is not None:
+ if self.config.problem_type is None:
+ if self.num_labels == 1:
+ self.config.problem_type = "regression"
+ elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+ self.config.problem_type = "single_label_classification"
+ else:
+ self.config.problem_type = "multi_label_classification"
+
+ if self.config.problem_type == "regression":
+ loss_fct = MSELoss()
+ if self.num_labels == 1:
+ loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+ else:
+ loss = loss_fct(pooled_logits, labels)
+ elif self.config.problem_type == "single_label_classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+ elif self.config.problem_type == "multi_label_classification":
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(pooled_logits, labels)
+ if not return_dict:
+ output = (pooled_logits,) + transformer_outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TransfoXLSequenceClassifierOutputWithPast(
+ loss=loss,
+ logits=pooled_logits,
+ mems=transformer_outputs.mems,
+ hidden_states=transformer_outputs.hidden_states,
+ attentions=transformer_outputs.attentions,
+ )
+
+
+__all__ = [
+ "AdaptiveEmbedding",
+ "TransfoXLForSequenceClassification",
+ "TransfoXLLMHeadModel",
+ "TransfoXLModel",
+ "TransfoXLPreTrainedModel",
+ "load_tf_weights_in_transfo_xl",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76f3ccc6259fcb033b44eb43dd98be23482221c
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
@@ -0,0 +1,251 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl.
+"""
+
+import torch
+from torch import nn
+
+
+# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
+# CUDA_MINOR = int(torch.version.cuda.split('.')[1])
+
+
+class ProjectedAdaptiveLogSoftmax(nn.Module):
+ def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
+ super().__init__()
+
+ self.n_token = n_token
+ self.d_embed = d_embed
+ self.d_proj = d_proj
+
+ self.cutoffs = cutoffs + [n_token]
+ self.cutoff_ends = [0] + self.cutoffs
+ self.div_val = div_val
+
+ self.shortlist_size = self.cutoffs[0]
+ self.n_clusters = len(self.cutoffs) - 1
+ self.head_size = self.shortlist_size + self.n_clusters
+
+ if self.n_clusters > 0:
+ self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
+ self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
+
+ self.out_layers = nn.ModuleList()
+ self.out_projs = nn.ParameterList()
+
+ if div_val == 1:
+ for i in range(len(self.cutoffs)):
+ if d_proj != d_embed:
+ self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
+ else:
+ self.out_projs.append(None)
+
+ self.out_layers.append(nn.Linear(d_embed, n_token))
+ else:
+ for i in range(len(self.cutoffs)):
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+ d_emb_i = d_embed // (div_val**i)
+
+ self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
+
+ self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))
+
+ self.keep_order = keep_order
+
+ def _compute_logit(self, hidden, weight, bias, proj):
+ if proj is None:
+ logit = nn.functional.linear(hidden, weight, bias=bias)
+ else:
+ # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
+ proj_hid = nn.functional.linear(hidden, proj.t().contiguous())
+ logit = nn.functional.linear(proj_hid, weight, bias=bias)
+ # else:
+ # logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
+ # if bias is not None:
+ # logit = logit + bias
+
+ return logit
+
+ def forward(self, hidden, labels=None, keep_order=False):
+ """
+ Params:
+ hidden :: [len*bsz x d_proj]
+ labels :: [len*bsz]
+
+ Return:
+ if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary
+ else: out :: [(len-1)*bsz] Negative log likelihood.
+ We could replace this implementation by the native PyTorch one if theirs had an option to set bias on all
+ clusters. See:
+ https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
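+
+ Example (an illustrative sketch added for clarity; the vocabulary size and cutoffs below are
+ arbitrary, not the WT103 defaults):
+
+ ```python
+ >>> import torch
+ >>> crit = ProjectedAdaptiveLogSoftmax(n_token=1000, d_embed=64, d_proj=64, cutoffs=[100, 500])
+ >>> hidden = torch.randn(2, 8, 64)  # [bsz x len x d_proj]
+ >>> labels = torch.randint(0, 1000, (2, 8))  # [bsz x len]
+ >>> crit(hidden, labels).shape  # per-token negative log likelihood, [(len - 1) * bsz]
+ torch.Size([14])
+ ```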
+ """
+
+ if labels is not None:
+ # Shift so that tokens < n predict n
+ hidden = hidden[..., :-1, :].contiguous()
+ labels = labels[..., 1:].contiguous()
+ hidden = hidden.view(-1, hidden.size(-1))
+ labels = labels.view(-1)
+ if hidden.size(0) != labels.size(0):
+ raise RuntimeError("Input and labels should have the same size in the batch dimension.")
+ else:
+ hidden = hidden.view(-1, hidden.size(-1))
+
+ if self.n_clusters == 0:
+ logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
+ if labels is not None:
+ mask = labels != -100
+ out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)
+ out[mask] = (
+ -nn.functional.log_softmax(logit, dim=-1)[mask].gather(1, labels[mask].unsqueeze(1)).squeeze(1)
+ )
+ else:
+ out = nn.functional.log_softmax(logit, dim=-1)
+ else:
+ # construct weights and biases
+ weights, biases = [], []
+ for i in range(len(self.cutoffs)):
+ if self.div_val == 1:
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+ weight_i = self.out_layers[0].weight[l_idx:r_idx]
+ bias_i = self.out_layers[0].bias[l_idx:r_idx]
+ else:
+ weight_i = self.out_layers[i].weight
+ bias_i = self.out_layers[i].bias
+
+ if i == 0:
+ weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
+ bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
+
+ weights.append(weight_i)
+ biases.append(bias_i)
+
+ head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+
+ head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+ head_logprob = nn.functional.log_softmax(head_logit, dim=1)
+
+ if labels is None:
+ out = hidden.new_empty((head_logit.size(0), self.n_token))
+ else:
+ out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)
+
+ offset = 0
+ cutoff_values = [0] + self.cutoffs
+ for i in range(len(cutoff_values) - 1):
+ l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
+
+ if labels is not None:
+ mask_i = (labels >= l_idx) & (labels < r_idx)
+ indices_i = mask_i.nonzero().squeeze()
+
+ if indices_i.numel() == 0:
+ continue
+
+ target_i = labels.index_select(0, indices_i) - l_idx
+ head_logprob_i = head_logprob.index_select(0, indices_i)
+ hidden_i = hidden.index_select(0, indices_i)
+ else:
+ hidden_i = hidden
+
+ if i == 0:
+ if labels is not None:
+ logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+ else:
+ out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
+ else:
+ weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+ tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
+ tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
+ cluster_prob_idx = self.cutoffs[0] + i - 1 # No probability for the head cluster
+ if labels is not None:
+ logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather(
+ 1, target_i[:, None]
+ ).squeeze(1)
+ else:
+ logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
+ out[:, l_idx:r_idx] = logprob_i
+
+ if labels is not None:
+ if (hasattr(self, "keep_order") and self.keep_order) or keep_order:
+ out.index_copy_(0, indices_i, -logprob_i)
+ else:
+ out[offset : offset + logprob_i.size(0)].copy_(-logprob_i)
+ offset += logprob_i.size(0)
+
+ return out
+
+ def log_prob(self, hidden):
+ r"""
+ Computes log probabilities for all \\(n\_classes\\). From:
+ https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.py
+
+ Args:
+ hidden (Tensor): a minibatch of examples
+
+ Returns:
+ log-probabilities for each class \\(c\\) in range \\(0 <= c <= n\_classes\\), where \\(n\_classes\\) is
+ a parameter passed to `AdaptiveLogSoftmaxWithLoss` constructor. Shape:
+
+ - Input: \\((N, in\_features)\\)
+ - Output: \\((N, n\_classes)\\)
+ """
+ if self.n_clusters == 0:
+ logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
+ return nn.functional.log_softmax(logit, dim=-1)
+ else:
+ # construct weights and biases
+ weights, biases = [], []
+ for i in range(len(self.cutoffs)):
+ if self.div_val == 1:
+ l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+ weight_i = self.out_layers[0].weight[l_idx:r_idx]
+ bias_i = self.out_layers[0].bias[l_idx:r_idx]
+ else:
+ weight_i = self.out_layers[i].weight
+ bias_i = self.out_layers[i].bias
+
+ if i == 0:
+ weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
+ bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
+
+ weights.append(weight_i)
+ biases.append(bias_i)
+
+ head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+ head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+
+ out = hidden.new_empty((head_logit.size(0), self.n_token))
+ head_logprob = nn.functional.log_softmax(head_logit, dim=1)
+
+ cutoff_values = [0] + self.cutoffs
+ for i in range(len(cutoff_values) - 1):
+ start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
+
+ if i == 0:
+ out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
+ else:
+ weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+ tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
+ tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
+
+ cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
+ logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
+ out[:, start_idx:stop_idx] = logprob_i
+
+ return out
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..70e2da0185563d4115748266474a935ffddef059
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
@@ -0,0 +1,825 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl.
+"""
+
+import glob
+import os
+import pickle
+import re
+from collections import Counter, OrderedDict
+from typing import Optional
+
+import numpy as np
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import (
+ cached_file,
+ check_torch_load_is_safe,
+ is_sacremoses_available,
+ is_torch_available,
+ logging,
+ requires_backends,
+ strtobool,
+ torch_only_method,
+)
+
+
+if is_sacremoses_available():
+ import sacremoses as sm
+
+
+if is_torch_available():
+ import torch
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+ "pretrained_vocab_file": "vocab.pkl",
+ "pretrained_vocab_file_torch": "vocab.bin",
+ "vocab_file": "vocab.txt",
+}
+
+
+PRETRAINED_CORPUS_ARCHIVE_MAP = {
+ "transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/corpus.bin",
+}
+CORPUS_NAME = "corpus.bin"
+
+MATCH_NUMBERS = r"(?<=\d)[,.](?=\d)", r" @\g<0>@ "
+DETOKENIZE_NUMBERS = [(r" @\,@ ", r","), (r" @\.@ ", r".")]
+
+
+def tokenize_numbers(text_array: list[str]) -> list[str]:
+ """
+ Splits large comma-separated numbers and floating point values. This is done by replacing commas with ' @,@ ' and
+ dots with ' @.@ '.
+
+ Args:
+ text_array: An already tokenized text as a list of strings.
+
+ Returns:
+ A list of strings with tokenized numbers.
+
+ Example:
+
+ ```python
+ >>> tokenize_numbers(["$", "5,000", "1.73", "m"])
+ ['$', '5', '@,@', '000', '1', '@.@', '73', 'm']
+ ```"""
+ tokenized = []
+ for i in range(len(text_array)):
+ reg, sub = MATCH_NUMBERS
+ replaced = re.sub(reg, sub, text_array[i]).split()
+ tokenized.extend(replaced)
+
+ return tokenized
+
+
+def detokenize_numbers(text: str) -> str:
+ """
+ Inverts the operation of *tokenize_numbers*. This replaces ' @,@ ' and ' @.@ ' with ',' and '.'.
+
+ Args:
+ text: A string where the numbers should be detokenized.
+
+ Returns:
+ A detokenized string.
+
+ Example:
+
+ ```python
+ >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
+ '$ 5,000 1.73 m'
+ ```"""
+ for reg, sub in DETOKENIZE_NUMBERS:
+ text = re.sub(reg, sub, text)
+ return text
+
+
+class TransfoXLTokenizer(PreTrainedTokenizer):
+ """
+ Construct a Transformer-XL tokenizer adapted from Vocab class in [the original
+ code](https://github.com/kimiyoung/transformer-xl). The Transformer-XL tokenizer is a word-level tokenizer (no
+ sub-word tokenization).
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ special (`list[str]`, *optional*):
+ A list of special tokens (to be treated by the original implementation of this tokenizer).
+ min_freq (`int`, *optional*, defaults to 0):
+ The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
+ will be mapped to `unk_token`).
+ max_size (`int`, *optional*):
+ The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
+ after excluding the tokens according to the `min_freq` rule.
+ lower_case (`bool`, *optional*, defaults to `False`):
+ Whether or not to lowercase the input when tokenizing.
+ delimiter (`str`, *optional*):
+ The delimiter used between tokens.
+ vocab_file (`str`, *optional*):
+ File containing the vocabulary (from the original implementation).
+ pretrained_vocab_file (`str`, *optional*):
+ File containing the vocabulary as saved with the `save_pretrained()` method.
+ never_split (`list[str]`, *optional*):
+ List of tokens that should never be split. If no list is specified, will simply use the existing special
+ tokens.
+ unk_token (`str`, *optional*, defaults to `"<unk>"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ eos_token (`str`, *optional*, defaults to `"<eos>"`):
+ The end of sequence token.
+ additional_special_tokens (`list[str]`, *optional*, defaults to `['<formula>']`):
+ A list of additional special tokens (for the HuggingFace functionality).
+ language (`str`, *optional*, defaults to `"en"`):
+ The language of this tokenizer (used for Moses preprocessing).
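+
+ Example (illustrative; loading the pretrained vocabulary unpickles data and therefore requires
+ setting the environment variable `TRUST_REMOTE_CODE=True`):
+
+ ```python
+ >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")
+ >>> tokenizer.tokenize("Hello, world!")
+ ['Hello', ',', 'world', '!']
+ ```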
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids"]
+
+ def __init__(
+ self,
+ special=None,
+ min_freq=0,
+ max_size=None,
+ lower_case=False,
+ delimiter=None,
+ vocab_file=None,
+ pretrained_vocab_file: Optional[str] = None,
+ never_split=None,
+ unk_token="",
+ eos_token="",
+ additional_special_tokens=[""],
+ language="en",
+ **kwargs,
+ ):
+ logger.error(
+ "`TransfoXL` was deprecated due to security issues linked to `pickle.load` in `TransfoXLTokenizer`. "
+ "See more details on this model's documentation page: "
+ "`https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/transfo-xl.md`."
+ )
+
+ requires_backends(self, "sacremoses")
+ if special is None:
+ special = []
+ self.counter = Counter()
+ self.special = special
+ self.min_freq = min_freq
+ self.max_size = max_size
+ self.lower_case = lower_case
+ self.delimiter = delimiter
+ self.vocab_file = vocab_file
+ self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
+ self.punction_without_space_before_pattern = re.compile(rf"[^\s][{self.punctuation_symbols}]")
+ self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
+ self.language = language
+ self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
+ self.moses_tokenizer = sm.MosesTokenizer(language)
+ self.moses_detokenizer = sm.MosesDetokenizer(language)
+ self.idx2sym = []
+ self.sym2idx = OrderedDict()
+ # This try... catch... is not beautiful but honestly this tokenizer was not made to be used
+ # in a library like ours, at all.
+ try:
+ vocab_dict = None
+ if pretrained_vocab_file is not None:
+ # Priority on pickle files (support PyTorch and TF)
+ if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+ raise ValueError(
+ "This part uses `pickle.load` which is insecure and will execute arbitrary code that is "
+ "potentially malicious. It's recommended to never unpickle data that could have come from an "
+ "untrusted source, or that could have been tampered with. If you already verified the pickle "
+ "data and decided to use it, you can set the environment variable "
+ "`TRUST_REMOTE_CODE` to `True` to allow it."
+ )
+ with open(pretrained_vocab_file, "rb") as f:
+ vocab_dict = pickle.load(f)
+
+ # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer
+ # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed.
+ # We therefore load it with torch, if it's available.
+ if isinstance(vocab_dict, int):
+ if not is_torch_available():
+ raise ImportError(
+ "Not trying to load dict with PyTorch as you need to install pytorch to load "
+ "from a PyTorch pretrained vocabulary, "
+ "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
+ )
+ check_torch_load_is_safe()
+ vocab_dict = torch.load(pretrained_vocab_file, weights_only=True)
+
+ if vocab_dict is not None:
+ for key, value in vocab_dict.items():
+ if key not in self.__dict__ or key in ["sym2idx", "idx2sym"]:
+ self.__dict__[key] = value
+ elif vocab_file is not None:
+ self.build_vocab()
+
+ except Exception as e:
+ raise ValueError(
+ f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
+ "If you tried to load a model saved through TransfoXLTokenizerFast, "
+ "please note they are not compatible."
+ ) from e
+
+ if vocab_file is not None:
+ self.build_vocab()
+
+ super().__init__(
+ special=special,
+ min_freq=min_freq,
+ max_size=max_size,
+ lower_case=lower_case,
+ delimiter=delimiter,
+ vocab_file=vocab_file,
+ pretrained_vocab_file=pretrained_vocab_file,
+ never_split=never_split,
+ unk_token=unk_token,
+ eos_token=eos_token,
+ additional_special_tokens=additional_special_tokens,
+ language=language,
+ **kwargs,
+ )
+
+ # these are not required to initialize the parent class as they are only used when tokenizing.
+ if never_split is None:
+ never_split = self.all_special_tokens
+ self.never_split = never_split
+
+ @property
+ def do_lower_case(self):
+ return self.lower_case
+
+ def _compile_space_around_punctuation_pattern(self):
+ look_ahead_for_special_token = f"(?=[{self.punctuation_symbols}])"
+ look_ahead_to_match_all_except_space = r"(?=[^\s])"
+ return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space)
+
+ def count_file(self, path, verbose=False, add_eos=False):
+ if verbose:
+ logger.info(f"counting file {path} ...")
+ assert os.path.exists(path), f"Input file {path} not found"
+
+ sents = []
+ with open(path, "r", encoding="utf-8") as f:
+ for idx, line in enumerate(f):
+ if verbose and idx > 0 and idx % 500000 == 0:
+ logger.info(f" line {idx}")
+ symbols = self.tokenize(line, add_eos=add_eos)
+ self.counter.update(symbols)
+ sents.append(symbols)
+
+ return sents
+
+ def count_sents(self, sents, verbose=False):
+ """
+ sents : a list of sentences, each a list of tokenized symbols
+ """
+ if verbose:
+ logger.info(f"counting {len(sents)} sents ...")
+ for idx, symbols in enumerate(sents):
+ if verbose and idx > 0 and idx % 500000 == 0:
+ logger.info(f" line {idx}")
+ self.counter.update(symbols)
+
+ def _build_from_file(self, vocab_file):
+ self.idx2sym = []
+ self.sym2idx = OrderedDict()
+
+ with open(vocab_file, "r", encoding="utf-8") as f:
+ for line in f:
+ symb = line.strip().split()[0]
+ self.add_symbol(symb)
+ if "" in self.sym2idx:
+ self.unk_idx = self.sym2idx[""]
+ elif "" in self.sym2idx:
+ self.unk_idx = self.sym2idx[""]
+ else:
+ raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.")
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ if os.path.isdir(save_directory):
+ vocab_file = os.path.join(
+ save_directory,
+ (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["pretrained_vocab_file"],
+ )
+ else:
+ vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+ with open(vocab_file, "wb") as f:
+ pickle.dump(self.__dict__, f)
+ return (vocab_file,)
+
+ def build_vocab(self):
+ if self.vocab_file:
+ logger.info(f"building vocab from {self.vocab_file}")
+ self._build_from_file(self.vocab_file)
+ logger.info(f"Final vocab size {len(self.sym2idx)}")
+ else:
+ logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}")
+ self.idx2sym = []
+ self.sym2idx = OrderedDict()
+
+ for sym in self.special:
+ self.add_special(sym)
+
+ for sym, cnt in self.counter.most_common(self.max_size):
+ if cnt < self.min_freq:
+ break
+ self.add_symbol(sym)
+
+ logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens")
+
+ @torch_only_method
+ def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
+ if verbose:
+ logger.info(f"encoding file {path} ...")
+ assert os.path.exists(path), f"Output file {path} not found"
+ encoded = []
+ with open(path, "r", encoding="utf-8") as f:
+ for idx, line in enumerate(f):
+ if verbose and idx > 0 and idx % 500000 == 0:
+ logger.info(f" line {idx}")
+ symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
+ encoded.append(self.convert_to_tensor(symbols))
+
+ if ordered:
+ encoded = torch.cat(encoded)
+
+ return encoded
+
+ @torch_only_method
+ def encode_sents(self, sents, ordered=False, verbose=False):
+ if verbose:
+ logger.info(f"encoding {len(sents)} sents ...")
+ encoded = []
+ for idx, symbols in enumerate(sents):
+ if verbose and idx > 0 and idx % 500000 == 0:
+ logger.info(f" line {idx}")
+ encoded.append(self.convert_to_tensor(symbols))
+
+ if ordered:
+ encoded = torch.cat(encoded)
+
+ return encoded
+
+ def add_special(self, sym):
+ if sym not in self.sym2idx:
+ self.idx2sym.append(sym)
+ self.sym2idx[sym] = len(self.idx2sym) - 1
+ setattr(self, f"{sym.strip('<>')}_idx", self.sym2idx[sym])
+
+ def add_symbol(self, sym):
+ if sym not in self.sym2idx:
+ self.idx2sym.append(sym)
+ self.sym2idx[sym] = len(self.idx2sym) - 1
+
+ def move_added_token(self, token: str, target_idx: int):
+ """
+ Moves an added token to a specific position in the vocab. This method should be used when resizing an embedding
+ layer other than the last one in the `AdaptiveEmbedding` in order to move the token in the tokenizer from the
+ default position (at the very end) to the desired one.
+
+ Args:
+ token: The token to move to a specific position in the vocab.
+ target_idx: The position where the token should be moved to.
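+
+ Example (an illustrative sketch; `<new_token>` is a hypothetical added token):
+
+ ```python
+ >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")
+ >>> tokenizer.add_tokens(["<new_token>"])
+ 1
+ >>> tokenizer.move_added_token("<new_token>", 1)
+ ```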
+ """
+ assert token in self.added_tokens_encoder, "Token which should be moved has to be an added token"
+ assert token not in self.idx2sym, "Token which should be moved is already in vocab"
+
+ # Insert sym into vocab
+ self.idx2sym.insert(target_idx, token)
+ self.sym2idx[token] = target_idx
+
+ # Shift following indices in sym2idx
+ for idx in range(target_idx + 1, len(self.idx2sym)):
+ current_sym = self.idx2sym[idx]
+ self.sym2idx[current_sym] = idx
+
+ # Delete token from added_tokens
+ old_index = self._added_tokens_encoder.pop(token)
+ self._added_tokens_decoder.pop(old_index)
+
+ def moses_punct_norm(self, text):
+ return self.moses_punct_normalizer.normalize(text)
+
+ def moses_tokenize(self, text):
+ return self.moses_tokenizer.tokenize(
+ text, aggressive_dash_splits=True, return_str=False, escape=False, protected_patterns=self.never_split
+ )
+
+ def moses_pipeline(self, text: str) -> list[str]:
+ """
+ Does basic tokenization using [`sacremoses.MosesPunctNormalizer`] and [`sacremoses.MosesTokenizer`] with
+ *aggressive_dash_splits=True* (see [`sacremoses.tokenize.MosesTokenizer.tokenize`]). Additionally, large
+ comma-separated numbers and floating point values are split. E.g. "23,000 people are 1.80m tall" -> "23 @,@ 000
+ people are 1 @.@ 80m tall"
+
+ Args:
+ text: Text to be tokenized.
+
+ Returns:
+ A list of tokenized strings.
+
+ Example:
+
+ ```python
+ >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")
+ >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall")
+ ['23', '@,@', '000', 'people', 'are', '1', '@.@', '80', 'm', 'tall']
+ ```"""
+ text = self.moses_punct_norm(text)
+ text = self.moses_tokenize(text)
+ text = tokenize_numbers(text)
+ return text
+
+ def _convert_id_to_token(self, idx):
+ """Converts an id in a token (BPE) using the vocab."""
+ assert 0 <= idx < len(self), f"Index {idx} out of vocabulary range"
+ return self.idx2sym[idx]
+
+ def _convert_token_to_id(self, sym):
+ """Converts a token (str) in an id using the vocab."""
+ if sym in self.sym2idx:
+ return self.sym2idx[sym]
+ else:
+ # logger.info(f'encounter unk {sym}')
+ # assert '<eos>' not in sym
+ if hasattr(self, "unk_idx"):
+ return self.sym2idx.get(sym, self.unk_idx)
+ # Backward compatibility with pre-trained models
+ elif "" in self.sym2idx:
+ return self.sym2idx[""]
+ elif "" in self.sym2idx:
+ return self.sym2idx[""]
+ else:
+ raise ValueError("Token not in vocabulary and no token in vocabulary for replacement.")
+
+ def convert_tokens_to_string(self, tokens):
+ """
+ Converts a sequence of tokens (strings) to a single string. Additionally, the split numbers are converted
+ back to their original form.
+ """
+ out_string = self.moses_detokenizer.detokenize(tokens)
+ return detokenize_numbers(out_string).strip()
+
+ @torch_only_method
+ def convert_to_tensor(self, symbols):
+ return torch.LongTensor(self.convert_tokens_to_ids(symbols))
+
+ @property
+ def vocab_size(self):
+ return len(self.idx2sym)
+
+ def get_vocab(self):
+ vocab = self.sym2idx.copy()
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _tokenize(self, line, add_eos=False, add_double_eos=False):
+ line = line.strip()
+ # convert to lower case
+ if self.lower_case:
+ line = line.lower()
+
+ # empty delimiter '' will evaluate False
+ if self.delimiter == "":
+ symbols = line
+ else:
+ symbols = self.moses_pipeline(line)
+
+ if add_double_eos: # lm1b
+ return ["<S>"] + symbols + ["<S>"]
+ elif add_eos:
+ return symbols + ["<eos>"]
+ else:
+ return symbols
+
+
+class LMOrderedIterator:
+ def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
+ """
+ data -- LongTensor -- the LongTensor is strictly ordered
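+
+ Example (an illustrative sketch; the arange tensor stands in for an ordered, encoded corpus):
+
+ ```python
+ >>> import torch
+ >>> it = LMOrderedIterator(torch.arange(1000), bsz=4, bptt=32)
+ >>> data, target, seq_len = it.get_batch(0)
+ >>> data.shape, target.shape
+ (torch.Size([4, 32]), torch.Size([4, 32]))
+ ```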
+ """
+ self.bsz = bsz
+ self.bptt = bptt
+ self.ext_len = ext_len if ext_len is not None else 0
+
+ self.device = device
+
+ # Work out how cleanly we can divide the dataset into bsz parts.
+ self.n_step = data.size(0) // bsz
+
+ # Trim off any extra elements that wouldn't cleanly fit (remainders).
+ data = data.narrow(0, 0, self.n_step * bsz)
+
+ # Evenly divide the data across the bsz batches.
+ self.data = data.view(bsz, -1).t().contiguous().to(device)
+
+ # Number of mini-batches
+ self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
+
+ def get_batch(self, i, bptt=None):
+ if bptt is None:
+ bptt = self.bptt
+ seq_len = min(bptt, self.data.size(0) - 1 - i)
+
+ end_idx = i + seq_len
+ beg_idx = max(0, i - self.ext_len)
+
+ data = self.data[beg_idx:end_idx]
+ target = self.data[i + 1 : i + 1 + seq_len]
+
+ data_out = data.transpose(0, 1).contiguous().to(self.device)
+ target_out = target.transpose(0, 1).contiguous().to(self.device)
+
+ return data_out, target_out, seq_len
+
+ def get_fixlen_iter(self, start=0):
+ for i in range(start, self.data.size(0) - 1, self.bptt):
+ yield self.get_batch(i)
+
+ def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
+ max_len = self.bptt + max_deviation * std
+ i = start
+ while True:
+ bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
+ bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
+ data, target, seq_len = self.get_batch(i, bptt)
+ i += seq_len
+ yield data, target, seq_len
+ if i >= self.data.size(0) - 2:
+ break
+
+ def __iter__(self):
+ return self.get_fixlen_iter()
+
+
+class LMShuffledIterator:
+ def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
+ """
+ data -- list[LongTensor] -- there is no order among the LongTensors
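+
+ Example (an illustrative sketch with synthetic sentences):
+
+ ```python
+ >>> import torch
+ >>> sents = [torch.randint(0, 100, (20,)) for _ in range(16)]
+ >>> it = LMShuffledIterator(sents, bsz=4, bptt=8)
+ >>> data, target, seq_len = next(iter(it))
+ >>> data.shape
+ torch.Size([4, 8])
+ ```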
+ """
+ self.data = data
+
+ self.bsz = bsz
+ self.bptt = bptt
+ self.ext_len = ext_len if ext_len is not None else 0
+
+ self.device = device
+ self.shuffle = shuffle
+
+ def get_sent_stream(self):
+ # index iterator
+ epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
+
+ # sentence iterator
+ for idx in epoch_indices:
+ yield self.data[idx]
+
+ @torch_only_method
+ def stream_iterator(self, sent_stream):
+ # streams for each data in the batch
+ streams = [None] * self.bsz
+
+ data = torch.LongTensor(self.bptt, self.bsz)
+ target = torch.LongTensor(self.bptt, self.bsz)
+
+ n_retain = 0
+
+ while True:
+ # data : [n_retain+bptt x bsz]
+ # target : [bptt x bsz]
+ data[n_retain:].fill_(-1)
+ target.fill_(-1)
+
+ valid_batch = True
+
+ for i in range(self.bsz):
+ n_filled = 0
+ try:
+ while n_filled < self.bptt:
+ if streams[i] is None or len(streams[i]) <= 1:
+ streams[i] = next(sent_stream)
+ # number of new tokens to fill in
+ n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
+ # first n_retain tokens are retained from last batch
+ data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
+ target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
+ streams[i] = streams[i][n_new:]
+ n_filled += n_new
+ except StopIteration:
+ valid_batch = False
+ break
+
+ if not valid_batch:
+ return
+
+ data_out = data.transpose(0, 1).contiguous().to(self.device)
+ target_out = target.transpose(0, 1).contiguous().to(self.device)
+
+ yield data_out, target_out, self.bptt
+
+ n_retain = min(data.size(0), self.ext_len)
+ if n_retain > 0:
+ data[:n_retain] = data[-n_retain:]
+ data.resize_(n_retain + self.bptt, data.size(1))
+
+ def __iter__(self):
+ # sent_stream is an iterator
+ sent_stream = self.get_sent_stream()
+
+ for batch in self.stream_iterator(sent_stream):
+ yield batch
+
+
+class LMMultiFileIterator(LMShuffledIterator):
+ def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
+ self.paths = paths
+ self.vocab = vocab
+
+ self.bsz = bsz
+ self.bptt = bptt
+ self.ext_len = ext_len if ext_len is not None else 0
+
+ self.device = device
+ self.shuffle = shuffle
+
+ def get_sent_stream(self, path):
+ sents = self.vocab.encode_file(path, add_double_eos=True)
+ if self.shuffle:
+ np.random.shuffle(sents)
+ sent_stream = iter(sents)
+
+ return sent_stream
+
+ def __iter__(self):
+ if self.shuffle:
+ np.random.shuffle(self.paths)
+
+ for path in self.paths:
+ # sent_stream is an iterator
+ sent_stream = self.get_sent_stream(path)
+ for batch in self.stream_iterator(sent_stream):
+ yield batch
+
+
+class TransfoXLCorpus:
+ @classmethod
+ @torch_only_method
+ def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+ """
+ Instantiate a pre-processed corpus.
+ """
+ vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+ is_local = os.path.isdir(pretrained_model_name_or_path)
+ # redirect to the cache, if necessary
+ try:
+ resolved_corpus_file = cached_file(pretrained_model_name_or_path, CORPUS_NAME, cache_dir=cache_dir)
+ except OSError:
+ logger.error(
+ f"Corpus '{pretrained_model_name_or_path}' was not found in corpus list"
+ f" ({', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys())}. We assumed '{pretrained_model_name_or_path}'"
+ f" was a path or url but couldn't find files {CORPUS_NAME} at this path or url."
+ )
+ return None
+ if is_local:
+ logger.info(f"loading corpus file {resolved_corpus_file}")
+ else:
+ logger.info(f"loading corpus file {CORPUS_NAME} from cache at {resolved_corpus_file}")
+
+ # Instantiate tokenizer.
+ corpus = cls(*inputs, **kwargs)
+ check_torch_load_is_safe()
+ corpus_dict = torch.load(resolved_corpus_file, weights_only=True)
+ for key, value in corpus_dict.items():
+ corpus.__dict__[key] = value
+ corpus.vocab = vocab
+ if corpus.train is not None:
+ corpus.train = torch.tensor(corpus.train, dtype=torch.long)
+ if corpus.valid is not None:
+ corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
+ if corpus.test is not None:
+ corpus.test = torch.tensor(corpus.test, dtype=torch.long)
+ return corpus
+
+ def __init__(self, *args, **kwargs):
+ self.vocab = TransfoXLTokenizer(*args, **kwargs)
+ self.dataset = None
+ self.train = None
+ self.valid = None
+ self.test = None
+
+ def build_corpus(self, path, dataset):
+ self.dataset = dataset
+
+ if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
+ self.vocab.count_file(os.path.join(path, "train.txt"))
+ self.vocab.count_file(os.path.join(path, "valid.txt"))
+ self.vocab.count_file(os.path.join(path, "test.txt"))
+ elif self.dataset == "wt103":
+ self.vocab.count_file(os.path.join(path, "train.txt"))
+ elif self.dataset == "lm1b":
+ train_path_pattern = os.path.join(
+ path,
+ "1-billion-word-language-modeling-benchmark-r13output",
+ "training-monolingual.tokenized.shuffled",
+ "news.en-*",
+ )
+ train_paths = glob.glob(train_path_pattern)
+ # the vocab will load from file when build_vocab() is called
+
+ self.vocab.build_vocab()
+
+ if self.dataset in ["ptb", "wt2", "wt103"]:
+ self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
+ self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
+ self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
+ elif self.dataset in ["enwik8", "text8"]:
+ self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
+ self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
+ self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
+ elif self.dataset == "lm1b":
+ self.train = train_paths
+ self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
+ self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)
+
+ def get_iterator(self, split, *args, **kwargs):
+ if split == "train":
+ if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
+ data_iter = LMOrderedIterator(self.train, *args, **kwargs)
+ elif self.dataset == "lm1b":
+ kwargs["shuffle"] = True
+ data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
+ elif split in ["valid", "test"]:
+ data = self.valid if split == "valid" else self.test
+ if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
+ data_iter = LMOrderedIterator(data, *args, **kwargs)
+ elif self.dataset == "lm1b":
+ data_iter = LMShuffledIterator(data, *args, **kwargs)
+ else:
+ raise ValueError(f"Split not recognized: {split}")
+
+ return data_iter
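+
+ # Usage sketch (an editor's illustration; assumes the corpus splits were populated, e.g. via
+ # `build_corpus`):
+ #
+ # train_iter = corpus.get_iterator("train", bsz=32, bptt=128)
+ # for data, target, seq_len in train_iter:
+ # ...  # one truncated-BPTT step per (data, target) batch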
+
+
+@torch_only_method
+def get_lm_corpus(datadir, dataset):
+ fn = os.path.join(datadir, "cache.pt")
+ fn_pickle = os.path.join(datadir, "cache.pkl")
+ if os.path.exists(fn):
+ logger.info("Loading cached dataset...")
+ check_torch_load_is_safe()
+ corpus = torch.load(fn, weights_only=True)
+ elif os.path.exists(fn_pickle):
+ logger.info("Loading cached dataset from pickle...")
+ if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+ raise ValueError(
+ "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
+ "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
+ "that could have been tampered with. If you already verified the pickle data and decided to use it, "
+ "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
+ )
+ with open(fn, "rb") as fp:
+ corpus = pickle.load(fp)
+ else:
+ logger.info(f"Producing dataset {dataset}...")
+ kwargs = {}
+ if dataset in ["wt103", "wt2"]:
+ kwargs["special"] = [""]
+ kwargs["lower_case"] = False
+ elif dataset == "ptb":
+ kwargs["special"] = [""]
+ kwargs["lower_case"] = True
+ elif dataset == "lm1b":
+ kwargs["special"] = []
+ kwargs["lower_case"] = False
+ kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
+ elif dataset in ["enwik8", "text8"]:
+ pass
+
+ corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
+ torch.save(corpus, fn)
+
+ return corpus
+
+
+__all__ = ["TransfoXLCorpus", "TransfoXLTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..941db2f6ac5fefe66e06d598e1adbe0db1cf1769
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__init__.py
@@ -0,0 +1,20 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_tvlt import *
+ from .feature_extraction_tvlt import *
+ from .processing_tvlt import *
+ from .modeling_tvlt import *
+ from .image_processing_tvlt import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fce8d547cc528ef84e5f628ebc8339803536976b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24fa4f5eec157b38c86ff1146dc21df2eb026c8d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/configuration_tvlt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..222438926ca1f8acf0ca33a75d0670e258522d3a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/feature_extraction_tvlt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd7909f68438d63042422252a8e100bcc3990581
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/image_processing_tvlt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6cf5cd713c9e9b0aab6ce515c1b1e3ef9bb66d18
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/modeling_tvlt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19fb50c42ac6c3211dd9e933749376d5c1d91b39
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/__pycache__/processing_tvlt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..57144e349f5bcb244b4f9eeeefebd0312c4770aa
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TVLT model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TvltConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
+ model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the TVLT
+ [ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ spectrogram_length (`int`, *optional*, defaults to 2048):
+ The time length of each audio spectrogram.
+ frequency_length (`int`, *optional*, defaults to 128):
+ The frequency length of audio spectrogram.
+ image_patch_size (`list[int]`, *optional*, defaults to `[16, 16]`):
+ The size (resolution) of each image patch.
+ audio_patch_size (`list[int]`, *optional*, defaults to `[16, 16]`):
+ The size (resolution) of each audio patch.
+ num_image_channels (`int`, *optional*, defaults to 3):
+ The number of input image channels.
+ num_audio_channels (`int`, *optional*, defaults to 1):
+ The number of input audio channels.
+ num_frames (`int`, *optional*, defaults to 8):
+ The maximum number of frames for an input video.
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the layer normalization layers.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+ use_mean_pooling (`bool`, *optional*, defaults to `False`):
+ Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token.
+ decoder_num_attention_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the decoder.
+ decoder_hidden_size (`int`, *optional*, defaults to 512):
+ Dimensionality of the decoder.
+ decoder_num_hidden_layers (`int`, *optional*, defaults to 8):
+ Number of hidden layers in the decoder.
+ decoder_intermediate_size (`int`, *optional*, defaults to 2048):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
+ pixel_mask_ratio (`float`, *optional*, defaults to 0.75):
+ Image patch masking ratio.
+ audio_mask_ratio (`float`, *optional*, defaults to 0.15):
+ Audio patch masking ratio.
+ audio_mask_type (`str`, *optional*, defaults to `"frame-level"`):
+ Audio patch masking type, choose between "frame-level" and "patch-level".
+ task_matching (`bool`, *optional*, defaults to `True`):
+ Whether to use vision audio matching task in pretraining.
+ task_mae (`bool`, *optional*, defaults to `True`):
+ Whether to use the masked auto-encoder (MAE) in pretraining.
+ loss_type (`str`, *optional*, defaults to `"classification"`):
+ The type of loss to use, either `"regression"` or `"classification"`.
+
+ Example:
+
+ ```python
+ >>> from transformers import TvltConfig, TvltModel
+
+ >>> # Initializing a TVLT ZinengTang/tvlt-base style configuration
+ >>> configuration = TvltConfig()
+
+ >>> # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration
+ >>> model = TvltModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "tvlt"
+
+ def __init__(
+ self,
+ image_size=224,
+ spectrogram_length=2048,
+ frequency_length=128,
+ image_patch_size=[16, 16],
+ audio_patch_size=[16, 16],
+ num_image_channels=3,
+ num_audio_channels=1,
+ num_frames=8,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-6,
+ qkv_bias=True,
+ use_mean_pooling=False,
+ decoder_num_attention_heads=16,
+ decoder_hidden_size=512,
+ decoder_num_hidden_layers=8,
+ decoder_intermediate_size=2048,
+ pixel_mask_ratio=0.75,
+ audio_mask_ratio=0.15,
+ audio_mask_type="frame-level",
+ task_matching=True,
+ task_mae=True,
+ loss_type="classification",
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+
+ if audio_mask_type not in ("frame-level", "patch-level"):
+ raise ValueError(
+ "audio_mask_type must be one of two acceptable strategies - {'frame-level', 'patch-level'}, "
+ f"got {audio_mask_type}"
+ )
+
+ self.image_size = image_size
+ self.spectrogram_length = spectrogram_length
+ self.frequency_length = frequency_length
+ self.image_patch_size = image_patch_size
+ self.audio_patch_size = audio_patch_size
+ self.num_image_channels = num_image_channels
+ self.num_audio_channels = num_audio_channels
+ self.num_frames = num_frames
+
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.qkv_bias = qkv_bias
+ self.use_mean_pooling = use_mean_pooling
+
+ self.decoder_num_attention_heads = decoder_num_attention_heads
+ self.decoder_hidden_size = decoder_hidden_size
+ self.decoder_num_hidden_layers = decoder_num_hidden_layers
+ self.decoder_intermediate_size = decoder_intermediate_size
+ self.pixel_mask_ratio = pixel_mask_ratio
+ self.audio_mask_ratio = audio_mask_ratio
+ self.audio_mask_type = audio_mask_type
+
+ self.task_matching = task_matching
+ self.task_mae = task_mae
+ self.loss_type = loss_type
+
+
+__all__ = ["TvltConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9350d31a0197926ba96e04dacfdb7bc71adbfc6
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
@@ -0,0 +1,233 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for TVLT."""
+
+from math import ceil
+from typing import Optional, Union
+
+import numpy as np
+
+from ....audio_utils import mel_filter_bank, spectrogram, window_function
+from ....feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
+from ....utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TvltFeatureExtractor(SequenceFeatureExtractor):
+ r"""
+ Constructs a TVLT audio feature extractor. This feature extractor can be used to prepare audios for the model.
+
+ This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
+ should refer to this superclass for more information regarding those methods.
+
+ Args:
+ spectrogram_length (`int`, *optional*, defaults to 2048):
+ The time length of each audio spectrogram.
+ num_channels (`int`, *optional*, defaults to 1):
+ Number of audio channels.
+ patch_size (`list[int]`, *optional*, defaults to `[16, 16]`):
+ The patch size of audio patch embedding.
+ feature_size (`int`, *optional*, defaults to 128):
+ The frequency length of audio spectrogram.
+ sampling_rate (`int`, *optional*, defaults to 44100):
+ The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz).
+ hop_length_to_sampling_rate (`int`, *optional*, defaults to 86):
+ The hop length is the length of the overlapping windows for the STFT used to obtain the Mel Frequency
+ coefficients, computed as `sampling_rate // hop_length_to_sampling_rate`. For example, with a sampling
+ rate of 44100 the hop length is 512, since 44100 // 86 = 512.
+ n_fft (`int`, *optional*, defaults to 2048):
+ Size of the Fourier transform.
+ padding_value (`float`, *optional*, defaults to 0.0):
+ Padding value used to pad the audio. Should correspond to silences.
+ """
+
+ model_input_names = ["audio_values", "audio_mask"]
+
+ def __init__(
+ self,
+ spectrogram_length=2048,
+ num_channels=1,
+ patch_size=[16, 16],
+ feature_size=128,
+ sampling_rate=44100,
+ hop_length_to_sampling_rate=86,
+ n_fft=2048,
+ padding_value=0.0,
+ **kwargs,
+ ):
+ super().__init__(
+ feature_size=feature_size,
+ sampling_rate=sampling_rate,
+ padding_value=padding_value,
+ **kwargs,
+ )
+
+ self.spectrogram_length = spectrogram_length
+ self.num_channels = num_channels
+ self.patch_size = patch_size
+ self.freq_len = feature_size // self.patch_size[1]
+ self.n_fft = n_fft
+ self.hop_length = sampling_rate // hop_length_to_sampling_rate
+ self.sampling_rate = sampling_rate
+ self.padding_value = padding_value
+ self.mel_filters = mel_filter_bank(
+ num_frequency_bins=1 + n_fft // 2,
+ num_mel_filters=feature_size,
+ min_frequency=0.0,
+ max_frequency=22050.0,
+ sampling_rate=sampling_rate,
+ norm="slaney",
+ mel_scale="slaney",
+ ).T
+
+ def _np_extract_fbank_features(self, waveform: np.ndarray) -> np.ndarray:
+ """
+ Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
+ implementation with 1e-5 tolerance.
+ """
+ log_spec = spectrogram(
+ waveform,
+ window_function(self.n_fft, "hann"),
+ frame_length=self.n_fft,
+ hop_length=self.hop_length,
+ power=2.0,
+ mel_filters=self.mel_filters.T,
+ log_mel="dB",
+ db_range=80.0,
+ )
+ log_spec = log_spec[:, :-1]
+ log_spec = log_spec - 20.0
+ log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0
+ return log_spec
+
+ def __call__(
+ self,
+ raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ return_attention_mask: Optional[bool] = True,
+ sampling_rate: Optional[int] = None,
+ resample: bool = False,
+ mask_audio: bool = False,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Main method to prepare one or several audio(s) for the model.
+
+ Args:
+ raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+ The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
+ values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+ stereo, i.e. single float per timestep.
+ return_tensors (`str` or [`~utils.TensorType`], *optional*):
+ If set, will return tensors instead of list of python integers. Acceptable values are:
+ - `'pt'`: Return PyTorch `torch.Tensor` objects.
+ - `'np'`: Return Numpy `np.ndarray` objects.
+ return_attention_mask (`bool`, *optional*, defaults to `True`):
+ Whether to return the attention mask. If left to the default, will return the attention mask according
+ to the specific feature_extractor's default. [What are attention masks?](../glossary#attention-mask)
+
+ <Tip>
+
+ For TvltTransformer models, `attention_mask` should always be passed for batched inference, to avoid
+ subtle bugs.
+
+ </Tip>
+
+ sampling_rate (`int`, *optional*):
+ The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+ `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
+ pipeline. Current model supports sampling rate 16000 and 44100.
+ resample (`bool`, *optional*, defaults to `False`):
+ If the sampling rate is not matched, resample the input audio to match.
+ mask_audio (`bool`, *optional*, defaults to `False`):
+ Whether or not to mask input audio for MAE task.
+
+ Returns:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+ - **audio_values** -- Audio values to be fed to a model, of shape (batch_size, num_channels, height,
+ width).
+
+ - **audio_mask** -- Audio masks to be fed to a model, of shape (batch_size, num_audio_patches).
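+
+ Example (an illustrative sketch with synthetic audio; the output shape follows from the
+ default spectrogram settings):
+
+ ```python
+ >>> import numpy as np
+ >>> feature_extractor = TvltFeatureExtractor()
+ >>> audio = np.random.randn(44100).astype(np.float32)  # one second at 44100 Hz
+ >>> inputs = feature_extractor(audio, sampling_rate=44100, return_tensors="np")
+ >>> inputs["audio_values"].shape
+ (1, 1, 96, 128)
+ ```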
+ """
+
+ if sampling_rate is not None:
+ if sampling_rate != self.sampling_rate:
+ raise ValueError(
+ "This feature extractor is set to support sampling rate"
+ f" of {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled"
+ f" with {self.sampling_rate} and not {sampling_rate}."
+ )
+ else:
+ logger.warning(
+ "It is strongly recommended to pass the `sampling_rate` argument to this function. "
+ "Failing to do so can result in silent errors that might be hard to debug."
+ )
+
+ is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+ if is_batched_numpy and len(raw_speech.shape) > 2:
+ raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+ is_batched = is_batched_numpy or (
+ isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
+ )
+ if is_batched:
+ raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
+ elif not is_batched and not isinstance(raw_speech, np.ndarray):
+ raw_speech = np.asarray(raw_speech, dtype=np.float32)
+ elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+ raw_speech = raw_speech.astype(np.float32)
+ # always return batch
+ if not is_batched:
+ raw_speech = [np.asarray([raw_speech]).T]
+
+ # Convert audio signals to log mel spectrograms, truncate by time axis
+ audio_features = [
+ self._np_extract_fbank_features(waveform.squeeze()).T[: self.spectrogram_length] for waveform in raw_speech
+ ]
+ if isinstance(audio_features[0], list):
+ audio_features = [np.asarray(feature, dtype=np.float32) for feature in audio_features]
+
+ # Create audio attention mask
+ max_patch_len = max(
+ ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features
+ ) # The maximum number of audio patches in a batch
+ if return_attention_mask:
+ audio_mask = [
+ (ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [1]
+ + (max_patch_len - ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [0]
+ for feature in audio_features
+ ]
+ audio_mask = np.array(audio_mask).astype(np.float32)
+
+ # convert into correct format for padding
+ max_time_len = max_patch_len // self.freq_len * self.patch_size[0] # The maximum audio size in a batch
+ padded_audio_features = np.ones([len(audio_features), 1, max_time_len, self.feature_size]).astype(np.float32)
+ padded_audio_features = padded_audio_features * self.padding_value
+ for i in range(len(audio_features)):
+ feature = audio_features[i]
+ padded_audio_features[i, :, : feature.shape[0], :] = feature
+
+ # return as BatchFeature
+ if return_attention_mask:
+ data = {"audio_values": padded_audio_features, "audio_mask": audio_mask}
+ else:
+ data = {"audio_values": padded_audio_features}
+
+ encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+ return encoded_inputs
+
+
+__all__ = ["TvltFeatureExtractor"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..01fb42429a96fdb8b171dbeedbd21e45a66a718f
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py
@@ -0,0 +1,438 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for TVLT."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
+ get_resize_output_image_size,
+ resize,
+ to_channel_dimension_format,
+)
+from ....image_utils import (
+ IMAGENET_STANDARD_MEAN,
+ IMAGENET_STANDARD_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ is_valid_image,
+ to_numpy_array,
+ valid_images,
+ validate_kwargs,
+ validate_preprocess_arguments,
+)
+from ....utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def make_batched(videos) -> list[list[ImageInput]]:
+ if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
+ return videos
+
+ elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
+ videos_dim = np.array(videos[0]).ndim
+ if videos_dim == 3:
+ return [videos]
+ elif videos_dim == 4:
+ return videos
+
+ elif is_valid_image(videos):
+ videos_dim = np.array(videos).ndim
+ if videos_dim == 3:
+ return [[videos]]
+ elif videos_dim == 4:
+ return [videos]
+ elif videos_dim == 5:
+ return videos
+
+ raise ValueError(f"Could not make batched video from {videos}")
+
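+ # Shapes accepted by `make_batched` (illustrative; channel position is irrelevant here):
+ #   a single image     ndim == 3  ->  [[image]]  a batch holding one single-frame video
+ #   a single video     ndim == 4  ->  [video]    a batch holding one video
+ #   a batch of videos  ndim == 5  ->  videos     returned as-is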
+
+class TvltImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a TVLT image processor.
+
+ This processor can be used to prepare either videos or images for the model by converting images to 1-frame videos.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+ `do_resize` parameter in the `preprocess` method.
+ size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
+ Size of the output image after resizing. The shortest edge of the image will be resized to
+ `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
+ `size` in the `preprocess` method.
+ patch_size (`list[int]`, *optional*, defaults to `[16, 16]`):
+ The patch size of image patch embedding.
+ num_frames (`int`, *optional*, defaults to 8):
+ The maximum number of video frames.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+ `preprocess` method.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
+ parameter in the `preprocess` method.
+ crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+ Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
+ `preprocess` method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+ parameter in the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to 1/255):
+ Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
+ in the `preprocess` method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+ method.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
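+
+ Example (an illustrative sketch; the checkpoint name follows the docstring examples in `modeling_tvlt.py`):
+
+ ```python
+ >>> import numpy as np
+ >>> from transformers import TvltImageProcessor
+
+ >>> image_processor = TvltImageProcessor.from_pretrained("ZinengTang/tvlt-base")
+ >>> video = list(np.random.randint(0, 256, (8, 3, 224, 224), dtype=np.uint8))
+ >>> inputs = image_processor(video, return_tensors="np")
+ >>> inputs["pixel_values"].shape  # (batch_size, num_frames, num_channels, height, width)
+ (1, 8, 3, 224, 224)
+ ```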
+ """
+
+ model_input_names = [
+ "pixel_values",
+ "pixel_mask",
+ "pixel_values_mixed",
+ "pixel_mask_mixed",
+ ]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Optional[dict[str, int]] = None,
+ patch_size: list[int] = [16, 16],
+ num_frames: int = 8,
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ do_center_crop: bool = True,
+ crop_size: Optional[dict[str, int]] = None,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_MEAN,
+ image_std: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_STD,
+ init_mask_generator=False,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ size = size if size is not None else {"shortest_edge": 224}
+ size = get_size_dict(size, default_to_square=False)
+ crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+ crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+ self.do_resize = do_resize
+ self.size = size
+ self.patch_size = patch_size
+ self.num_frames = num_frames
+ self.do_center_crop = do_center_crop
+ self.crop_size = crop_size
+ self.resample = resample
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean
+ self.image_std = image_std
+ self._valid_processor_keys = [
+ "videos",
+ "do_resize",
+ "size",
+ "patch_size",
+ "num_frames",
+ "resample",
+ "do_center_crop",
+ "crop_size",
+ "do_rescale",
+ "rescale_factor",
+ "do_normalize",
+ "image_mean",
+ "image_std",
+ "is_mixed",
+ "return_tensors",
+ "data_format",
+ "input_data_format",
+ ]
+
+ def resize(
+ self,
+ image: np.ndarray,
+ size: dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BILINEAR,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize an image.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`dict[str, int]`):
+ Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
+ have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
+ shortest edge of length `s` while keeping the aspect ratio of the original image.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+ Resampling filter to use when resizing the image.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ """
+ size = get_size_dict(size, default_to_square=False)
+ if "shortest_edge" in size:
+ output_size = get_resize_output_image_size(
+ image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+ )
+ elif "height" in size and "width" in size:
+ output_size = (size["height"], size["width"])
+ else:
+ raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
+ return resize(
+ image,
+ size=output_size,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
+
+ def _preprocess_image(
+ self,
+ image: ImageInput,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ resample: Optional[PILImageResampling] = None,
+ do_center_crop: Optional[bool] = None,
+ crop_size: Optional[dict[str, int]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ ) -> np.ndarray:
+ """Preprocesses a single image."""
+
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ # All transformations expect numpy arrays.
+ image = to_numpy_array(image)
+
+ if do_rescale and is_scaled_image(image):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ input_data_format = infer_channel_dimension_format(image)
+
+ if do_resize:
+ image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+ if do_center_crop:
+ image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
+
+ if do_rescale:
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+ if do_normalize:
+ image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+ image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ return image
+
+ def preprocess(
+ self,
+ videos: ImageInput,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ patch_size: Optional[list[int]] = None,
+ num_frames: Optional[int] = None,
+ resample: Optional[PILImageResampling] = None,
+ do_center_crop: Optional[bool] = None,
+ crop_size: Optional[dict[str, int]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ is_mixed: bool = False,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: ChannelDimension = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess a video or image, or a batch of videos or images.
+
+ Args:
+ videos (`ImageInput`):
+ Images or videos to preprocess. Expects a single frame or a batch of frames with pixel values ranging
+ from 0 to 255. If passing in frames with pixel values between 0 and 1, set `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`dict[str, int]`, *optional*, defaults to `self.size`):
+ Size of the image after applying resize.
+ patch_size (`list[int]`, *optional*, defaults to `self.patch_size`):
+ The patch size of image patch embedding.
+ num_frames (`int`, *optional*, defaults to `self.num_frames`):
+ The maximum number of video frames.
+ resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+ Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+ has an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+ Whether to center crop the image.
+ crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+ Size of the image after applying the center crop.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image values to the range [0, 1].
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean.
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation.
+ is_mixed (`bool`, *optional*):
+ Whether the input video contains negative samples.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: Use the inferred channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+ Returns:
+ [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+ - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
+ width).
+
+ - **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches).
+
+ - **pixel_values_mixed** -- Pixel values mixing positive and negative samples, to be fed to a model, of
+ shape (batch_size, num_channels, height, width).
+
+ - **pixel_mask_mixed** -- Pixel masks for the mixed pixel values, to be fed to a model, of shape
+ (batch_size, num_pixel_patches).
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ resample = resample if resample is not None else self.resample
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+
+ size = size if size is not None else self.size
+ size = get_size_dict(size, default_to_square=False)
+ crop_size = crop_size if crop_size is not None else self.crop_size
+ crop_size = get_size_dict(crop_size, param_name="crop_size")
+ patch_size = patch_size if patch_size is not None else self.patch_size
+ num_frames = num_frames if num_frames is not None else self.num_frames
+
+ validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+ if not valid_images(videos):
+ raise ValueError(
+ "Invalid image or video type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+
+ videos = make_batched(videos)
+
+ # Check that the number of frames does not exceed the maximum number of frames of the model
+ for video in videos:
+ if len(video) > num_frames:
+ raise ValueError(
+ f"The number of frames must not be greater than the maximum number of frames of the model ({num_frames})."
+ )
+
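+ # Build a patch-level mask per video: ones for patches of real frames, zeros for
+ # the frame slots a shorter video is missing relative to the longest video in the
+ # batch.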
+ max_num_frames = max(len(video) for video in videos)
+ num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2
+ video_masks = np.array(
+ [
+ len(video) * num_patches_per_image * [1] + (max_num_frames - len(video)) * num_patches_per_image * [0]
+ for video in videos
+ ]
+ )
+
+ videos = [
+ [
+ self._preprocess_image(
+ image=img,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ )
+ for img in video
+ ]
+ for video in videos
+ ]
+
+ # If videos contain both positive/negative, use mixed key for video-audio matching task
+ if is_mixed:
+ data = {"pixel_values_mixed": videos, "pixel_mask_mixed": video_masks}
+ else:
+ data = {"pixel_values": videos, "pixel_mask": video_masks}
+
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["TvltImageProcessor"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b21df928ff3d74022d858f7a73578802127dd08
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py
@@ -0,0 +1,1274 @@
+# coding=utf-8
+# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch TVLT model."""
+
+import collections.abc
+import math
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput, SequenceClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from .configuration_tvlt import TvltConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "TvltConfig"
+_CHECKPOINT_FOR_DOC = "ZinengTang/tvlt-base"
+
+
+@dataclass
+class TvltModelOutput(ModelOutput):
+ """
+ Class for TvltModel's outputs, with potential hidden states and attentions.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+ Sequence of hidden-states at the output of the last layer of the model.
+ last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`):
+ Pixel sequence of hidden-states at the output of the last layer of the model.
+ last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`):
+ Audio sequence of hidden-states at the output of the last layer of the model.
+ pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`):
+ Tensor indicating which pixel patches are masked (1) and which are not (0).
+ audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`):
+ Tensor indicating which audio patches are masked (1) and which are not (0).
+ pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`):
+ Tensor containing the ids permutation of pixel masking.
+ audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`):
+ Tensor containing the ids permutation of audio masking.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ last_hidden_state: Optional[torch.FloatTensor] = None
+ last_pixel_hidden_state: Optional[torch.FloatTensor] = None
+ last_audio_hidden_state: Optional[torch.FloatTensor] = None
+ pixel_label_masks: Optional[torch.LongTensor] = None
+ audio_label_masks: Optional[torch.LongTensor] = None
+ pixel_ids_restore: Optional[torch.LongTensor] = None
+ audio_ids_restore: Optional[torch.LongTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class TvltDecoderOutput(ModelOutput):
+ """
+ Class for TvltDecoder's outputs, with potential hidden states and attentions.
+
+ Args:
+ logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
+ Pixel reconstruction logits.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ logits: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class TvltForPreTrainingOutput(ModelOutput):
+ """
+ Class for TvltForPreTraining's outputs, with potential hidden states and attentions.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`):
+ Sum of the vision-audio matching loss and the pixel and audio reconstruction losses.
+ matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`):
+ Matching objective logits.
+ pixel_logits (`torch.FloatTensor` of shape
+ `(batch_size, pixel_patch_length, image_patch_size ** 2 * num_image_channels)`): Pixel reconstruction
+ logits.
+ audio_logits (`torch.FloatTensor` of shape
+ `(batch_size, audio_patch_length, audio_patch_size[0] * audio_patch_size[1] * num_audio_channels)`): Audio
+ reconstruction logits.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
+ shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+ plus the initial embedding outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+ sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+ the self-attention heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ matching_logits: Optional[torch.FloatTensor] = None
+ pixel_logits: Optional[torch.FloatTensor] = None
+ audio_logits: Optional[torch.FloatTensor] = None
+ hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+ attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+def generate_pixel_mask_noise(pixel_values, pixel_mask=None, mask_ratio=0.75):
+ """Generate noise for audio masking."""
+
+ batch_size, seq_len = pixel_values.shape[:2]
+ noise = torch.rand((batch_size, seq_len), device=pixel_values.device) # noise in [0, 1]
+ len_keep = int(seq_len * (1 - mask_ratio))
+ return noise, len_keep
+
+
+def generate_audio_mask_noise(audio_values, audio_mask=None, mask_ratio=0.75, mask_type="patch-level", freq_len=8):
+ """Generate noise for audio masking."""
+
+ batch_size, seq_len = audio_values.shape[:2]
+ if mask_type == "frame-level":
+ num_time_patches = seq_len // freq_len
+ noise = (
+ torch.rand(batch_size, num_time_patches, device=audio_values.device)
+ .unsqueeze(-1)
+ .repeat(1, 1, freq_len)
+ .view(batch_size, seq_len)
+ ) # noise in [0, 1]
+ elif mask_type == "patch-level":
+ noise = torch.rand(batch_size, seq_len, device=audio_values.device) # noise in [0, 1]
+ else:
+ raise ValueError(f"Unsupported audio mask type: {mask_type}. Expected 'frame-level' or 'patch-level'.")
+ len_keep = int(seq_len * (1 - mask_ratio))
+ return noise, len_keep
+
+
+def random_masking(sequence, noise, len_keep, attention_masks=None):
+ """
+ Perform random masking by per-sample shuffling, done by argsorting random noise; the noise pattern (see the
+ generators above) determines whether masking is patch-level or frame-level. `sequence` has shape [batch_size,
+ seq_len, hidden_dim].
+ """
+
+ batch_size, seq_len, hidden_dim = sequence.shape
+
+ # sort noise for each sample
+ ids_shuffle = torch.argsort(noise, dim=1) # ascend: small is keep, large is remove
+ ids_restore = torch.argsort(ids_shuffle, dim=1)
+
+ # keep the first subset
+ ids_keep = ids_shuffle[:, :len_keep]
+ sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, hidden_dim))
+
+ # generate the binary mask: 0 is keep, 1 is remove
+ label_masks = torch.ones([batch_size, seq_len], device=sequence.device)
+ label_masks[:, :len_keep] = 0
+ # unshuffle to get the binary mask
+ label_masks = torch.gather(label_masks, dim=1, index=ids_restore)
+
+ if attention_masks is not None:
+ label_masks *= attention_masks
+ attention_masks = torch.gather(attention_masks, dim=1, index=ids_keep)
+
+ return sequence_masked, attention_masks, label_masks, ids_restore
+
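+ # Toy example of the argsort trick in `random_masking` (illustrative):
+ # with noise = [0.9, 0.1, 0.5, 0.3] and len_keep = 2,
+ #   ids_shuffle = argsort(noise)       = [1, 3, 2, 0]  (ascending noise)
+ #   ids_keep    = ids_shuffle[:2]      = [1, 3]        (tokens kept)
+ #   ids_restore = argsort(ids_shuffle) = [3, 0, 2, 1]  (undoes the shuffle)
+ # and the label mask [0, 0, 1, 1] (0 = keep) gathered with ids_restore becomes
+ # [1, 0, 1, 0]: tokens 1 and 3 survive, tokens 0 and 2 are masked out.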
+
+class TvltPixelEmbeddings(nn.Module):
+ """Construct the patch and position embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+
+ self.patch_embeddings = TvltPixelPatchEmbeddings(config)
+ self.num_patches_per_image = self.patch_embeddings.num_patches_per_image
+
+ self.type_embed_v = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+ self.temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+ self.pos_embed_v = nn.Parameter(torch.zeros(1, self.num_patches_per_image, config.hidden_size))
+
+ self.config = config
+
+ def forward(self, pixel_values, attention_masks=None):
+ # create patch embeddings
+ batch_size, num_frames, num_channels, height, width = pixel_values.shape
+
+ embeddings = self.patch_embeddings(pixel_values)
+ embeddings += self.pos_embed_v.repeat(1, num_frames, 1)
+ embeddings += torch.repeat_interleave(self.temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1)
+ embeddings += self.type_embed_v
+
+ return embeddings, attention_masks
+
+
+class TvltAudioEmbeddings(nn.Module):
+ """Construct the patch and position embeddings."""
+
+ def __init__(self, config):
+ super().__init__()
+
+ self.patch_embeddings = TvltAudioPatchEmbeddings(config)
+ self.num_patches = self.patch_embeddings.num_patches
+
+ self.type_embed_a = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+ self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
+ self.pos_embed_a = nn.Parameter(torch.zeros(1, self.num_patches // self.num_freq_patches, config.hidden_size))
+ self.freq_embed = nn.Parameter(torch.zeros(1, self.num_freq_patches, config.hidden_size))
+
+ self.config = config
+
+ def forward(self, audio_values, attention_masks=None):
+ # create patch embeddings
+ embeddings = self.patch_embeddings(audio_values)
+
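+ # Positional information is factorized: a frequency embedding tiled along time, a
+ # time embedding repeated across frequency bins, and a modality ("type") embedding
+ # shared by every audio patch.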
+ num_time_patches = embeddings.size(1) // self.num_freq_patches
+ embeddings += self.freq_embed.repeat(1, num_time_patches, 1)
+ embeddings += torch.repeat_interleave(self.pos_embed_a[:, :num_time_patches], self.num_freq_patches, dim=1)
+ embeddings += self.type_embed_a
+
+ return embeddings, attention_masks
+
+
+class TvltPixelPatchEmbeddings(nn.Module):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ image_size, patch_size = config.image_size, config.image_patch_size
+ num_channels, hidden_size = config.num_image_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches_per_image = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches_per_image = num_patches_per_image
+ self.hidden_size = hidden_size
+
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+ batch_size, num_frames, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ if height != self.image_size[0] or width != self.image_size[1]:
+ raise ValueError(
+ f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+ )
+
+ pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
+ embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+ embeddings = embeddings.reshape(batch_size, num_frames * self.num_patches_per_image, self.hidden_size)
+
+ return embeddings
+
+
+class TvltAudioPatchEmbeddings(nn.Module):
+ """
+ This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config):
+ super().__init__()
+ spectrogram_length, frequency_length, patch_size = (
+ config.spectrogram_length,
+ config.frequency_length,
+ config.audio_patch_size,
+ )
+ num_channels, hidden_size = config.num_audio_channels, config.hidden_size
+
+ spectrogram_size = (spectrogram_length, frequency_length)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+ num_patches = (spectrogram_size[1] // patch_size[1]) * (spectrogram_size[0] // patch_size[0])
+ patch_shape = (spectrogram_size[0] // patch_size[0], spectrogram_size[1] // patch_size[1])
+ self.spectrogram_size = spectrogram_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.num_patches = num_patches
+ self.patch_shape = patch_shape
+
+ self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, audio_values: torch.Tensor) -> torch.Tensor:
+ batch_size, num_channels, height, width = audio_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ if height > self.spectrogram_size[0] or width != self.spectrogram_size[1]:
+ raise ValueError(
+ f"Input audio size ({height}*{width}) doesn't match model"
+ f" ({self.spectrogram_size[0]}*{self.spectrogram_size[1]})."
+ )
+ embeddings = self.projection(audio_values).flatten(2).transpose(1, 2)
+
+ return embeddings
+
+
+class TvltSelfAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+ f"heads {config.num_attention_heads}."
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x):
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(*new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+ if attention_mask is not None:
+ # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+ attention_scores = attention_scores + attention_mask
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(*new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
+
+
+class TvltSelfOutput(nn.Module):
+ """
+ The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, config: TvltConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+class TvltAttention(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.attention = TvltSelfAttention(config)
+ self.output = TvltSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads):
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.query = prune_linear_layer(self.attention.query, index)
+ self.attention.key = prune_linear_layer(self.attention.key, index)
+ self.attention.value = prune_linear_layer(self.attention.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
+
+ def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+ self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
+
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class TvltIntermediate(nn.Module):
+ def __init__(self, config: TvltConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+
+class TvltOutput(nn.Module):
+ def __init__(self, config: TvltConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ hidden_states = hidden_states + input_tensor
+
+ return hidden_states
+
+
+class TvltLayer(GradientCheckpointingLayer):
+ """This corresponds to the Block class in the timm implementation."""
+
+ def __init__(self, config):
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = TvltAttention(config)
+ self.intermediate = TvltIntermediate(config)
+ self.output = TvltOutput(config)
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+ self_attention_outputs = self.attention(
+ self.layernorm_before(hidden_states), # in ViLT, layernorm is applied before self-attention
+ attention_mask,
+ head_mask,
+ output_attentions=output_attentions,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ # first residual connection
+ hidden_states = attention_output + hidden_states.to(attention_output.device)
+
+ # in ViLT, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_states)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.output(layer_output, hidden_states)
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+class TvltEncoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([TvltLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ head_mask=None,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ ):
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class TvltPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: TvltConfig
+ base_model_prefix = "tvlt"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ # Slightly different from the TF version which uses truncated_normal for initialization
+ # cf https://github.com/pytorch/pytorch/pull/5617
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+
+
+TVLT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`TvltConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TVLT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+ details.
+
+ audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+ details.
+
+ pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
+ Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+ details.
+
+ audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`):
+ Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+ details.
+
+ pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+ Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Pixel values mixed can
+ be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
+
+ pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
+ Pixel masks of pixel_values_mixed. Pixel masks mixed can be obtained using [`TvltProcessor`]. See
+ [`TvltProcessor.__call__`] for details.
+
+ mask_pixel (`bool`, *optional*):
+ Whether to mask pixel for MAE tasks. Only set to True in TvltForPreTraining.
+
+ mask_audio (`bool`, *optional*):
+ Whether to mask audio for MAE tasks. Only set to True in TvltForPreTraining.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare TVLT Model transformer outputting raw hidden-states without any specific head on top.",
+ TVLT_START_DOCSTRING,
+)
+class TvltModel(TvltPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ self.pixel_embeddings = TvltPixelEmbeddings(config)
+ self.audio_embeddings = TvltAudioEmbeddings(config)
+ self.encoder = TvltEncoder(config)
+
+ self.cls_embedding = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+
+ if config.use_mean_pooling:
+ self.layernorm = None
+ else:
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.pixel_embeddings.patch_embeddings, self.audio_embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune):
+ """
+ Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+ class PreTrainedModel
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TvltModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ audio_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.FloatTensor] = None,
+ audio_mask: Optional[torch.FloatTensor] = None,
+ mask_pixel: bool = False,
+ mask_audio: bool = False,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.FloatTensor], TvltModelOutput]:
+ r"""
+ Returns:
+
+ Examples:
+
+ ```python
+ >>> from transformers import TvltProcessor, TvltModel
+ >>> import numpy as np
+ >>> import torch
+
+ >>> num_frames = 8
+ >>> images = list(np.random.randn(num_frames, 3, 224, 224))
+ >>> audio = list(np.random.randn(10000))
+
+ >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
+ >>> model = TvltModel.from_pretrained("ZinengTang/tvlt-base")
+
+ >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
+
+ >>> outputs = model(**input_dict)
+ >>> last_hidden_state = outputs.last_hidden_state
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ pixel_embedding_output, pixel_mask = self.pixel_embeddings(pixel_values, pixel_mask)
+
+ audio_embedding_output, audio_mask = self.audio_embeddings(audio_values, audio_mask)
+
+ # Mask pixel if mask_pixel is True
+ pixel_label_masks = None
+ pixel_ids_restore = None
+ if mask_pixel:
+ pixel_mask_noise, pixel_len_keep = generate_pixel_mask_noise(
+ pixel_embedding_output, pixel_mask=pixel_mask, mask_ratio=self.config.pixel_mask_ratio
+ )
+ pixel_embedding_output, pixel_mask, pixel_label_masks, pixel_ids_restore = random_masking(
+ pixel_embedding_output,
+ pixel_mask_noise,
+ pixel_len_keep,
+ attention_masks=pixel_mask,
+ )
+
+ # Mask audio if mask_audio is True
+ audio_label_masks = None
+ audio_ids_restore = None
+ if mask_audio:
+ num_freq_patches = self.config.frequency_length // self.config.audio_patch_size[1]
+ audio_mask_noise, audio_len_keep = generate_audio_mask_noise(
+ audio_embedding_output,
+ audio_mask=audio_mask,
+ mask_ratio=self.config.audio_mask_ratio,
+ mask_type=self.config.audio_mask_type,
+ freq_len=num_freq_patches,
+ )
+ audio_embedding_output, audio_mask, audio_label_masks, audio_ids_restore = random_masking(
+ audio_embedding_output,
+ audio_mask_noise,
+ audio_len_keep,
+ attention_masks=audio_mask,
+ )
+
+ # Prepare for encoder inputs and attention masks
+ batch_size = pixel_values.size(0)
+ embedding_output = torch.cat(
+ [self.cls_embedding.repeat(batch_size, 1, 1), pixel_embedding_output, audio_embedding_output], 1
+ )
+ masked_pixel_len = pixel_embedding_output.size(1)
+
+ attention_mask = None
+ if pixel_mask is not None and audio_mask is not None:
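+ # `pixel_mask[:, :1]` contributes one mask slot for the prepended CLS embedding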
+ attention_mask = torch.cat([pixel_mask[:, :1], pixel_mask, audio_mask], 1)
+
+ input_shape = embedding_output.size()
+ extended_attention_mask = None
+ if attention_mask is not None:
+ extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ attention_mask=extended_attention_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ if self.layernorm is not None:
+ sequence_output = self.layernorm(sequence_output)
+
+ pixel_sequence_output = sequence_output[:, 1 : 1 + masked_pixel_len]
+ audio_sequence_output = sequence_output[:, 1 + masked_pixel_len :]
+ if not return_dict:
+ return (
+ sequence_output,
+ pixel_sequence_output,
+ audio_sequence_output,
+ pixel_label_masks,
+ audio_label_masks,
+ pixel_ids_restore,
+ audio_ids_restore,
+ ) + encoder_outputs[1:]
+
+ return TvltModelOutput(
+ last_hidden_state=sequence_output,
+ last_pixel_hidden_state=pixel_sequence_output,
+ last_audio_hidden_state=audio_sequence_output,
+ pixel_label_masks=pixel_label_masks,
+ audio_label_masks=audio_label_masks,
+ pixel_ids_restore=pixel_ids_restore,
+ audio_ids_restore=audio_ids_restore,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+class TvltDecoder(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+
+ decoder_config = deepcopy(config)
+ decoder_config.hidden_size = config.decoder_hidden_size
+ decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
+ decoder_config.num_attention_heads = config.decoder_num_attention_heads
+ decoder_config.intermediate_size = config.decoder_intermediate_size
+ self.decoder_layers = nn.ModuleList(
+ [TvltLayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
+ )
+
+ self.layernorm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
+
+ self.gradient_checkpointing = False
+ self.config = config
+
+ def forward(
+ self,
+ hidden_states,
+ output_attentions=False,
+ output_hidden_states=False,
+ return_dict=True,
+ ):
+ # apply Transformer layers (blocks)
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+ for i, layer_module in enumerate(self.decoder_layers):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_outputs = layer_module(hidden_states, output_attentions=output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ # predictor projection
+ logits = self.layernorm(hidden_states)
+
+ if not return_dict:
+ return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
+ return TvltDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)
+
+
+@add_start_docstrings(
+ "The TVLT Model transformer with the decoder on top for self-supervised pre-training.",
+ TVLT_START_DOCSTRING,
+)
+class TvltForPreTraining(TvltPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+
+ self.task_matching = config.task_matching
+ self.task_mae = config.task_mae
+ if not (self.task_matching or self.task_mae):
+ raise ValueError("Must set at least one of matching task and MAE task to true")
+
+ self.tvlt = TvltModel(config)
+
+ if self.task_matching:
+ self.matching_head = TvltMatchingHead(config)
+
+ if self.task_mae:
+ self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True)
+
+ self.pixel_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
+ self.audio_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
+
+ self.decoder = TvltDecoder(config)
+
+ decoder_hidden_size = config.decoder_hidden_size
+
+ num_frames = config.num_frames
+ num_patches_per_image = self.tvlt.pixel_embeddings.num_patches_per_image
+ self.decoder_pixel_pos_embed = nn.Parameter(torch.zeros(1, num_patches_per_image, decoder_hidden_size))
+ self.decoder_temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, decoder_hidden_size))
+ self.decoder_pixel_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
+
+ num_audio_patches = self.tvlt.audio_embeddings.num_patches
+ num_freq_patches = config.frequency_length // config.audio_patch_size[1]
+ self.decoder_audio_pos_embed = nn.Parameter(
+ torch.zeros(1, num_audio_patches // num_freq_patches, decoder_hidden_size)
+ )
+ self.decoder_freq_embed = nn.Parameter(torch.zeros(1, num_freq_patches, decoder_hidden_size))
+ self.decoder_audio_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
+
+ pixel_mae_output_dim = self.config.image_patch_size[0] ** 2 * self.config.num_image_channels
+ self.pixel_mae_head = TvltMAEHead(config, pixel_mae_output_dim)
+ audio_mae_output_dim = (
+ self.config.audio_patch_size[0] * self.config.audio_patch_size[1] * self.config.num_audio_channels
+ )
+ self.audio_mae_head = TvltMAEHead(config, audio_mae_output_dim)
+
+ self.num_frames = num_frames
+ self.num_patches_per_image = num_patches_per_image
+ self.num_freq_patches = num_freq_patches
+ self.image_patch_size = config.image_patch_size
+ self.audio_patch_size = config.audio_patch_size
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def patchify_pixel(self, pixel_values):
+ """
+ pixel_values: [batch_size, num_frames, 3, height, width]
+ """
+ batch_size, num_frames, num_channels, height, width = pixel_values.shape
+ num_patches_height = height // self.image_patch_size[0]
+ num_patches_width = width // self.image_patch_size[1]
+ patchified_pixel_values = pixel_values.reshape(
+ shape=(
+ batch_size,
+ num_frames,
+ num_channels,
+ num_patches_height,
+ self.image_patch_size[0],
+ num_patches_width,
+ self.image_patch_size[1],
+ )
+ )
+ patchified_pixel_values = torch.einsum("ntchpwq->nthwpqc", patchified_pixel_values)
+ patchified_pixel_values = patchified_pixel_values.reshape(
+ shape=(
+ batch_size,
+ num_patches_height * num_patches_width * num_frames,
+ self.image_patch_size[0] * self.image_patch_size[1] * num_channels,
+ )
+ )
+ return patchified_pixel_values
+
+ def patchify_audio(self, audio_values):
+ """
+ audio_values: [batch_size, 1, height, width]
+ """
+ batch_size, num_channels, height, width = audio_values.shape
+ num_patches_height = height // self.audio_patch_size[0]
+ num_patches_width = width // self.audio_patch_size[1]
+ patchified_audio_values = audio_values.reshape(
+ shape=(
+ batch_size,
+ num_channels,
+ num_patches_height,
+ self.audio_patch_size[0],
+ num_patches_width,
+ self.audio_patch_size[1],
+ )
+ )
+ patchified_audio_values = torch.einsum("nchpwq->nhwpqc", patchified_audio_values)
+ patchified_audio_values = patchified_audio_values.reshape(
+ shape=(
+ batch_size,
+ num_patches_height * num_patches_width,
+ self.audio_patch_size[0] * self.audio_patch_size[1] * num_channels,
+ )
+ )
+ return patchified_audio_values
+
+ def pixel_mae_loss(self, pixel_values, pixel_predictions, mask):
+ patchified_pixel_values = self.patchify_pixel(pixel_values)
+ loss = (pixel_predictions - patchified_pixel_values) ** 2
+ loss = loss.mean(dim=-1) # [batch_size, pixel_patch_length], mean loss per patch
+ loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
+ return loss
+
+ def audio_mae_loss(self, audio_values, audio_predictions, mask):
+ patchified_audio_values = self.patchify_audio(audio_values)
+ loss = (audio_predictions - patchified_audio_values) ** 2
+ loss = loss.mean(dim=-1) # [batch_size, audio_patch_length], mean loss per patch
+ loss = (loss * mask).sum() / mask.sum() # mean loss on removed patches
+ return loss
+
+ def concatenate_mask(self, mask_token, sequence, ids_restore):
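+ # Re-insert learnable mask tokens at the positions dropped by `random_masking`,
+ # then gather with `ids_restore` so the patches return to their original order
+ # before being fed to the decoder.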
+ batch_size, seq_length, dim = sequence.shape
+ mask_tokens = mask_token.repeat(batch_size, ids_restore.shape[1] - seq_length, 1)
+ padded_sequence = torch.cat([sequence, mask_tokens], dim=1)
+ padded_sequence = torch.gather(
+ padded_sequence, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim)
+ ) # unshuffle
+ return padded_sequence
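+
+ # Worked sketch of the unshuffle (illustrative values): with 4 patches of which 2
+ # were kept by the encoder in shuffled order [3, 1], ids_restore = [2, 1, 3, 0].
+ # Appending two mask tokens gives [v3, v1, m, m]; gathering with ids_restore yields
+ # [m, v1, m, v3], i.e. every patch back at its original position, with the masked
+ # slots filled by the learned mask token.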
+
+ @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=TvltForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ audio_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.FloatTensor] = None,
+ audio_mask: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ pixel_values_mixed: Optional[torch.FloatTensor] = None,
+ pixel_mask_mixed: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple[torch.FloatTensor], TvltForPreTrainingOutput]:
+ r"""
+ pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+ Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Pixel values mixed can
+ be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
+
+ pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel masks of pixel_values_mixed. Pixel values mixed can be obtained using [`TvltProcessor`]. See
+ [`TvltProcessor.__call__`] for details.
+
+ labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
+ Labels for computing the vision audio matching loss. Indices should be in `[0, 1]`. `num_labels` has to be 1.
+
+ Return:
+
+ Examples:
+
+ ```python
+ >>> from transformers import TvltProcessor, TvltForPreTraining
+ >>> import numpy as np
+ >>> import torch
+
+ >>> num_frames = 8
+ >>> images = list(np.random.randn(num_frames, 3, 224, 224))
+ >>> images_mixed = list(np.random.randn(num_frames, 3, 224, 224))
+ >>> audio = list(np.random.randn(10000))
+ >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
+ >>> model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base")
+ >>> input_dict = processor(
+ ... images, audio, images_mixed, sampling_rate=44100, mask_pixel=True, mask_audio=True, return_tensors="pt"
+ ... )
+
+ >>> outputs = model(**input_dict)
+ >>> loss = outputs.loss
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+ total_loss = 0.0
+ # Initialized up front so the return paths below are well-defined even when a
+ # pretraining task is disabled in the config.
+ loss = None
+ matching_logits = None
+
+ if self.task_matching:
+ if labels is None:
+ raise ValueError("Matching task requires labels")
+ if pixel_values_mixed is None:
+ raise ValueError("Matching task requires pixel_values_mixed")
+
+ outputs = self.tvlt(
+ pixel_values_mixed,
+ audio_values,
+ pixel_mask=pixel_mask_mixed,
+ audio_mask=audio_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+ matching_logits = self.matching_head(sequence_output)
+
+ loss_fct = BCEWithLogitsLoss()
+ loss = loss_fct(matching_logits.view(-1), labels.view(-1))
+ total_loss += loss
+
+ pixel_logits = None
+ audio_logits = None
+ if self.task_mae and self.training:
+ outputs = self.tvlt(
+ pixel_values,
+ audio_values,
+ pixel_mask=pixel_mask,
+ audio_mask=audio_mask,
+ mask_pixel=True,
+ mask_audio=True,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ pixel_sequence_output = outputs.last_pixel_hidden_state if return_dict else outputs[1]
+ audio_sequence_output = outputs.last_audio_hidden_state if return_dict else outputs[2]
+ pixel_label_masks = outputs.pixel_label_masks if return_dict else outputs[3]
+ audio_label_masks = outputs.audio_label_masks if return_dict else outputs[4]
+ pixel_ids_restore = outputs.pixel_ids_restore if return_dict else outputs[5]
+ audio_ids_restore = outputs.audio_ids_restore if return_dict else outputs[6]
+
+ pixel_decoder_input = self.encoder_to_decoder(
+ pixel_sequence_output
+ ) # [batch_size, num_masked_pixel_patches, decoder_hidden_size]
+ audio_decoder_input = self.encoder_to_decoder(
+ audio_sequence_output
+ ) # [batch_size, num_masked_audio_patches, decoder_hidden_size]
+ num_frames = pixel_values.size(1)
+ pixel_decoder_input = self.concatenate_mask(self.pixel_mask_token, pixel_decoder_input, pixel_ids_restore)
+ pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_pos_embed.repeat(1, num_frames, 1)
+ pixel_decoder_input = pixel_decoder_input + torch.repeat_interleave(
+ self.decoder_temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1
+ )
+ pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_type_embed
+ pixel_decoder_outputs = self.decoder(pixel_decoder_input)
+ pixel_logits = self.pixel_mae_head(pixel_decoder_outputs.logits)
+
+ audio_decoder_input = self.concatenate_mask(self.audio_mask_token, audio_decoder_input, audio_ids_restore)
+ num_time_patches = audio_decoder_input.size(1) // self.num_freq_patches
+ audio_decoder_input = audio_decoder_input + self.decoder_freq_embed.repeat(1, num_time_patches, 1)
+ audio_decoder_input = audio_decoder_input + torch.repeat_interleave(
+ self.decoder_audio_pos_embed[:, :num_time_patches], self.num_freq_patches, dim=1
+ )
+ audio_decoder_input = audio_decoder_input + self.decoder_audio_type_embed
+ audio_decoder_outputs = self.decoder(audio_decoder_input)
+ audio_logits = self.audio_mae_head(audio_decoder_outputs.logits)
+
+ loss = self.pixel_mae_loss(pixel_values, pixel_logits, pixel_label_masks) + self.audio_mae_loss(
+ audio_values, audio_logits, audio_label_masks
+ )
+ total_loss += loss
+
+ if not return_dict:
+ output = (matching_logits, pixel_logits, audio_logits) + outputs[7:]
+ return ((total_loss,) + output) if loss is not None else output
+
+ return TvltForPreTrainingOutput(
+ loss=total_loss,
+ matching_logits=matching_logits,
+ pixel_logits=pixel_logits,
+ audio_logits=audio_logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+class TvltPooler(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+class TvltMatchingHead(nn.Module):
+ def __init__(self, config):
+ super().__init__()
+ self.pooler = TvltPooler(config)
+ self.fc = nn.Linear(config.hidden_size, 1)
+
+ def forward(self, hidden_states):
+ hidden_states = self.fc(self.pooler(hidden_states))
+ return hidden_states
+
+
+class TvltMAEHead(nn.Module):
+ def __init__(self, config, output_dim=None):
+ super().__init__()
+ self.config = config
+ self.decoder = nn.Linear(config.decoder_hidden_size, output_dim)
+
+ def forward(self, hidden_states):
+ hidden_states = self.decoder(hidden_states)
+ return hidden_states
+
+
+@add_start_docstrings(
+ """
+ Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token)
+ for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval.
+ """,
+ TVLT_START_DOCSTRING,
+)
+class TvltForAudioVisualClassification(TvltPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+
+ self.tvlt = TvltModel(config)
+
+ # Classifier head
+ self.classifier = nn.Sequential(
+ nn.Linear(config.hidden_size, config.hidden_size * 2),
+ nn.LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),
+ nn.GELU(),
+ nn.Linear(config.hidden_size * 2, config.num_labels),
+ )
+ self.config = config
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ pixel_values: torch.FloatTensor,
+ audio_values: torch.FloatTensor,
+ pixel_mask: Optional[torch.FloatTensor] = None,
+ audio_mask: Optional[torch.FloatTensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: Optional[torch.LongTensor] = None,
+ ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
+ Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes
+ refers to the number of classes in audiovisual tasks.
+
+ Return:
+
+ Examples:
+ ```python
+ >>> from transformers import TvltProcessor, TvltForAudioVisualClassification
+ >>> import numpy as np
+ >>> import torch
+
+ >>> num_frames = 8
+ >>> images = list(np.random.randn(num_frames, 3, 224, 224))
+ >>> audio = list(np.random.randn(10000))
+ >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
+ >>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base")
+ >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
+
+ >>> outputs = model(**input_dict)
+ >>> loss = outputs.loss
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.tvlt(
+ pixel_values,
+ audio_values,
+ pixel_mask=pixel_mask,
+ audio_mask=audio_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = outputs[0][:, 0]
+ logits = self.classifier(sequence_output) # (batch_size, num_labels) classification/regression scores
+
+ loss = None
+ if labels is not None:
+ if self.config.loss_type == "regression":
+ loss_fct = MSELoss()
+ loss = loss_fct(logits, labels)
+ elif self.config.loss_type == "classification":
+ loss_fct = CrossEntropyLoss()
+ loss = loss_fct(logits, labels)
+
+ if not return_dict:
+ output = (logits,) + outputs[4:]
+ return ((loss,) + output) if loss is not None else output
+
+ return SequenceClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = ["TvltModel", "TvltForPreTraining", "TvltForAudioVisualClassification", "TvltPreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ecbac0dfa16f7c2cf70d07775bba2beeb456973
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py
@@ -0,0 +1,86 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for TVLT.
+"""
+
+from ....processing_utils import ProcessorMixin
+
+
+class TvltProcessor(ProcessorMixin):
+ r"""
+ Constructs a TVLT processor which wraps a TVLT image processor and TVLT feature extractor into a single processor.
+
+ [`TvltProcessor`] offers all the functionalities of [`TvltImageProcessor`] and [`TvltFeatureExtractor`]. See the
+ docstring of [`~TvltProcessor.__call__`] for more information.
+
+ Args:
+ image_processor (`TvltImageProcessor`):
+ An instance of [`TvltImageProcessor`]. The image processor is a required input.
+ feature_extractor (`TvltFeatureExtractor`):
+ An instance of [`TvltFeatureExtractor`]. The feature extractor is a required input.
+ """
+
+ attributes = ["image_processor", "feature_extractor"]
+ image_processor_class = "TvltImageProcessor"
+ feature_extractor_class = "TvltFeatureExtractor"
+
+ def __init__(self, image_processor, feature_extractor):
+ super().__init__(image_processor=image_processor, feature_extractor=feature_extractor)
+
+ self.image_processor = image_processor
+ self.feature_extractor = feature_extractor
+
+ def __call__(
+ self,
+ images=None,
+ audio=None,
+ images_mixed=None,
+ sampling_rate=None,
+ mask_audio=False,
+ mask_pixel=False,
+ *args,
+ **kwargs,
+ ):
+ """
+ Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio`
+ argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the
+ above two methods for more information.
+ """
+
+ if images is None and audio is None:
+ raise ValueError("You need to specify either an `images` or `audio` input to process.")
+
+ images_mixed_dict = None
+ if images is not None:
+ images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs)
+ if images_mixed is not None:
+ images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs)
+ if audio is not None:
+ audio_dict = self.feature_extractor(
+ audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs
+ )
+
+ output_dict = {}
+ if audio is not None:
+ output_dict.update(audio_dict)
+ if images is not None:
+ output_dict.update(images_dict)
+ if images_mixed_dict is not None:
+ output_dict.update(images_mixed_dict)
+ return output_dict
+
+
+__all__ = ["TvltProcessor"]
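+
+# Usage sketch (illustrative; mirrors the doctest examples in modeling_tvlt.py):
+# >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
+# >>> inputs = processor(images, audio, sampling_rate=44100, return_tensors="pt")
+# The returned dict merges the feature extractor's audio outputs with the image
+# processor's pixel outputs (plus the mixed variants when `images_mixed` is passed).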
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9552c827365d3913539be3eafaf822146ada5829
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_van import *
+ from .modeling_van import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01c59614e26f5a3b083008d27976008661ce7bcb
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3f613160fc538ed4aa53108e274802721d60482
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/configuration_van.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac4934a29e664b5f37d2d45db4fbbe5d95de9ed7
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/__pycache__/modeling_van.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/configuration_van.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/configuration_van.py
new file mode 100644
index 0000000000000000000000000000000000000000..be94e45ec63d9516584c320c98e0b1157c092c2c
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/configuration_van.py
@@ -0,0 +1,110 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VAN model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class VanConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`VanModel`]. It is used to instantiate a VAN model
+ according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+ defaults will yield a similar configuration to that of the VAN
+ [Visual-Attention-Network/van-base](https://huggingface.co/Visual-Attention-Network/van-base) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ patch_sizes (`list[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
+ Patch size to use in each stage's embedding layer.
+ strides (`list[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
+ Stride size to use in each stage's embedding layer to downsample the input.
+ hidden_sizes (`list[int]`, *optional*, defaults to `[64, 128, 320, 512]`):
+ Dimensionality (hidden size) at each stage.
+ depths (`list[int]`, *optional*, defaults to `[3, 3, 12, 3]`):
+ Depth (number of layers) for each stage.
+ mlp_ratios (`list[int]`, *optional*, defaults to `[8, 8, 4, 4]`):
+ The expansion ratio of the MLP layer at each stage.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in each layer. If string, `"gelu"`, `"relu"`,
+ `"selu"` and `"gelu_new"` are supported.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+ The epsilon used by the layer normalization layers.
+ layer_scale_init_value (`float`, *optional*, defaults to 0.01):
+ The initial value for layer scaling.
+ drop_path_rate (`float`, *optional*, defaults to 0.0):
+ The dropout probability for stochastic depth.
+ dropout_rate (`float`, *optional*, defaults to 0.0):
+ The dropout probability for dropout.
+
+ Example:
+ ```python
+ >>> from transformers import VanModel, VanConfig
+
+ >>> # Initializing a VAN van-base style configuration
+ >>> configuration = VanConfig()
+ >>> # Initializing a model from the van-base style configuration
+ >>> model = VanModel(configuration)
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "van"
+
+ def __init__(
+ self,
+ image_size=224,
+ num_channels=3,
+ patch_sizes=[7, 3, 3, 3],
+ strides=[4, 2, 2, 2],
+ hidden_sizes=[64, 128, 320, 512],
+ depths=[3, 3, 12, 3],
+ mlp_ratios=[8, 8, 4, 4],
+ hidden_act="gelu",
+ initializer_range=0.02,
+ layer_norm_eps=1e-6,
+ layer_scale_init_value=1e-2,
+ drop_path_rate=0.0,
+ dropout_rate=0.0,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ self.image_size = image_size
+ self.num_channels = num_channels
+ self.patch_sizes = patch_sizes
+ self.strides = strides
+ self.hidden_sizes = hidden_sizes
+ self.depths = depths
+ self.mlp_ratios = mlp_ratios
+ self.hidden_act = hidden_act
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.layer_scale_init_value = layer_scale_init_value
+ self.drop_path_rate = drop_path_rate
+ self.dropout_rate = dropout_rate
+
+
+__all__ = ["VanConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/modeling_van.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/modeling_van.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0fc0bc1a63728ea54935775b085dd8041e46a45
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/van/modeling_van.py
@@ -0,0 +1,520 @@
+# coding=utf-8
+# Copyright 2022 BNRist (Tsinghua University), TKLNDST (Nankai University) and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Visual Attention Network (VAN) model."""
+
+import math
+from collections import OrderedDict
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_outputs import (
+ BaseModelOutputWithNoAttention,
+ BaseModelOutputWithPoolingAndNoAttention,
+ ImageClassifierOutputWithNoAttention,
+)
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_van import VanConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "VanConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
+_EXPECTED_OUTPUT_SHAPE = [1, 512, 7, 7]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "Visual-Attention-Network/van-base"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+ """
+ Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+ Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+ however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+ See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+ layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+ argument.
+ """
+ if drop_prob == 0.0 or not training:
+ return input
+ keep_prob = 1 - drop_prob
+ shape = (input.shape[0],) + (1,) * (input.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
+ random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+ random_tensor.floor_() # binarize
+ output = input.div(keep_prob) * random_tensor
+ return output
+
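+# Sanity sketch (not part of the original module): dividing by `keep_prob` makes the
+# layer expectation-preserving. With drop_prob = 0.2 a kept sample is scaled by
+# 1 / 0.8 = 1.25, so E[output] = 0.8 * 1.25 * input + 0.2 * 0 = input.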
+
+class VanDropPath(nn.Module):
+ """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+ def __init__(self, drop_prob: Optional[float] = None) -> None:
+ super().__init__()
+ self.drop_prob = drop_prob
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ return drop_path(hidden_states, self.drop_prob, self.training)
+
+ def extra_repr(self) -> str:
+ return f"p={self.drop_prob}"
+
+
+class VanOverlappingPatchEmbedder(nn.Module):
+ """
+ Downsamples the input using a patchify operation; with the default `stride` of 4 and patch size of 7,
+ adjacent windows overlap by roughly half of their area. From [PVTv2: Improved Baselines with Pyramid Vision
+ Transformer](https://huggingface.co/papers/2106.13797).
+ """
+
+ def __init__(self, in_channels: int, hidden_size: int, patch_size: int = 7, stride: int = 4):
+ super().__init__()
+ self.convolution = nn.Conv2d(
+ in_channels, hidden_size, kernel_size=patch_size, stride=stride, padding=patch_size // 2
+ )
+ self.normalization = nn.BatchNorm2d(hidden_size)
+
+ def forward(self, input: torch.Tensor) -> torch.Tensor:
+ hidden_state = self.convolution(input)
+ hidden_state = self.normalization(hidden_state)
+ return hidden_state
+
+
+class VanMlpLayer(nn.Module):
+ """
+ MLP with depth-wise convolution, from [PVTv2: Improved Baselines with Pyramid Vision
+ Transformer](https://huggingface.co/papers/2106.13797).
+ """
+
+ def __init__(
+ self,
+ in_channels: int,
+ hidden_size: int,
+ out_channels: int,
+ hidden_act: str = "gelu",
+ dropout_rate: float = 0.5,
+ ):
+ super().__init__()
+ self.in_dense = nn.Conv2d(in_channels, hidden_size, kernel_size=1)
+ self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, groups=hidden_size)
+ self.activation = ACT2FN[hidden_act]
+ self.dropout1 = nn.Dropout(dropout_rate)
+ self.out_dense = nn.Conv2d(hidden_size, out_channels, kernel_size=1)
+ self.dropout2 = nn.Dropout(dropout_rate)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ hidden_state = self.in_dense(hidden_state)
+ hidden_state = self.depth_wise(hidden_state)
+ hidden_state = self.activation(hidden_state)
+ hidden_state = self.dropout1(hidden_state)
+ hidden_state = self.out_dense(hidden_state)
+ hidden_state = self.dropout2(hidden_state)
+ return hidden_state
+
+
+class VanLargeKernelAttention(nn.Module):
+ """
+ Basic Large Kernel Attention (LKA).
+ """
+
+ def __init__(self, hidden_size: int):
+ super().__init__()
+ self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=5, padding=2, groups=hidden_size)
+ self.depth_wise_dilated = nn.Conv2d(
+ hidden_size, hidden_size, kernel_size=7, dilation=3, padding=9, groups=hidden_size
+ )
+ self.point_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ hidden_state = self.depth_wise(hidden_state)
+ hidden_state = self.depth_wise_dilated(hidden_state)
+ hidden_state = self.point_wise(hidden_state)
+ return hidden_state
+
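+# Padding/receptive-field sketch (illustrative arithmetic): the 5x5 depth-wise conv
+# needs padding (5 - 1) / 2 = 2, and the dilated 7x7 conv spans (7 - 1) * 3 + 1 = 19
+# pixels, hence padding 9 -- both preserve the spatial size. Stacked with the 1x1
+# point-wise conv, this decomposition emulates a single large dense kernel at a
+# fraction of its parameter count.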
+
+class VanLargeKernelAttentionLayer(nn.Module):
+ """
+ Computes attention using Large Kernel Attention (LKA) and attends the input.
+ """
+
+ def __init__(self, hidden_size: int):
+ super().__init__()
+ self.attention = VanLargeKernelAttention(hidden_size)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ attention = self.attention(hidden_state)
+ attended = hidden_state * attention
+ return attended
+
+
+class VanSpatialAttentionLayer(nn.Module):
+ """
+ Van spatial attention layer composed of projection (via conv) -> act -> Large Kernel Attention (LKA) ->
+ projection (via conv), followed by a residual connection.
+ """
+
+ def __init__(self, hidden_size: int, hidden_act: str = "gelu"):
+ super().__init__()
+ self.pre_projection = nn.Sequential(
+ OrderedDict(
+ [
+ ("conv", nn.Conv2d(hidden_size, hidden_size, kernel_size=1)),
+ ("act", ACT2FN[hidden_act]),
+ ]
+ )
+ )
+ self.attention_layer = VanLargeKernelAttentionLayer(hidden_size)
+ self.post_projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ residual = hidden_state
+ hidden_state = self.pre_projection(hidden_state)
+ hidden_state = self.attention_layer(hidden_state)
+ hidden_state = self.post_projection(hidden_state)
+ hidden_state = hidden_state + residual
+ return hidden_state
+
+
+class VanLayerScaling(nn.Module):
+ """
+ Scales the inputs by a learnable parameter initialized by `initial_value`.
+ """
+
+ def __init__(self, hidden_size: int, initial_value: float = 1e-2):
+ super().__init__()
+ self.weight = nn.Parameter(initial_value * torch.ones(hidden_size), requires_grad=True)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ # unsqueezing for broadcasting
+ hidden_state = self.weight.unsqueeze(-1).unsqueeze(-1) * hidden_state
+ return hidden_state
+
+
+class VanLayer(nn.Module):
+ """
+ Van layer composed of normalization layers, large kernel attention (LKA) and a multi-layer perceptron (MLP).
+ """
+
+ def __init__(
+ self,
+ config: VanConfig,
+ hidden_size: int,
+ mlp_ratio: int = 4,
+ drop_path_rate: float = 0.5,
+ ):
+ super().__init__()
+ self.drop_path = VanDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+ self.pre_normalization = nn.BatchNorm2d(hidden_size)
+ self.attention = VanSpatialAttentionLayer(hidden_size, config.hidden_act)
+ self.attention_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
+ self.post_normalization = nn.BatchNorm2d(hidden_size)
+ self.mlp = VanMlpLayer(
+ hidden_size, hidden_size * mlp_ratio, hidden_size, config.hidden_act, config.dropout_rate
+ )
+ self.mlp_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ residual = hidden_state
+ # attention
+ hidden_state = self.pre_normalization(hidden_state)
+ hidden_state = self.attention(hidden_state)
+ hidden_state = self.attention_scaling(hidden_state)
+ hidden_state = self.drop_path(hidden_state)
+ # residual connection
+ hidden_state = residual + hidden_state
+ residual = hidden_state
+ # mlp
+ hidden_state = self.post_normalization(hidden_state)
+ hidden_state = self.mlp(hidden_state)
+ hidden_state = self.mlp_scaling(hidden_state)
+ hidden_state = self.drop_path(hidden_state)
+ # residual connection
+ hidden_state = residual + hidden_state
+ return hidden_state
+
+
+class VanStage(nn.Module):
+ """
+ VanStage, consisting of multiple layers.
+ """
+
+ def __init__(
+ self,
+ config: VanConfig,
+ in_channels: int,
+ hidden_size: int,
+ patch_size: int,
+ stride: int,
+ depth: int,
+ mlp_ratio: int = 4,
+ drop_path_rate: float = 0.0,
+ ):
+ super().__init__()
+ self.embeddings = VanOverlappingPatchEmbedder(in_channels, hidden_size, patch_size, stride)
+ self.layers = nn.Sequential(
+ *[
+ VanLayer(
+ config,
+ hidden_size,
+ mlp_ratio=mlp_ratio,
+ drop_path_rate=drop_path_rate,
+ )
+ for _ in range(depth)
+ ]
+ )
+ self.normalization = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+
+ def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+ hidden_state = self.embeddings(hidden_state)
+ hidden_state = self.layers(hidden_state)
+ # rearrange b c h w -> b (h w) c
+ batch_size, hidden_size, height, width = hidden_state.shape
+ hidden_state = hidden_state.flatten(2).transpose(1, 2)
+ hidden_state = self.normalization(hidden_state)
+ # rearrange b (h w) c -> b c h w
+ hidden_state = hidden_state.view(batch_size, height, width, hidden_size).permute(0, 3, 1, 2)
+ return hidden_state
+
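+# Shape round-trip note: nn.LayerNorm normalizes the last dimension, so the stage
+# flattens (b, c, h, w) to (b, h*w, c) for normalization and then restores
+# (b, c, h, w) for the next stage's convolutions.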
+
+class VanEncoder(nn.Module):
+ """
+ VanEncoder, consisting of multiple stages.
+ """
+
+ def __init__(self, config: VanConfig):
+ super().__init__()
+ self.stages = nn.ModuleList([])
+ patch_sizes = config.patch_sizes
+ strides = config.strides
+ hidden_sizes = config.hidden_sizes
+ depths = config.depths
+ mlp_ratios = config.mlp_ratios
+ drop_path_rates = [
+ x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")
+ ]
+
+ for num_stage, (patch_size, stride, hidden_size, depth, mlp_expansion, drop_path_rate) in enumerate(
+ zip(patch_sizes, strides, hidden_sizes, depths, mlp_ratios, drop_path_rates)
+ ):
+ is_first_stage = num_stage == 0
+ in_channels = hidden_sizes[num_stage - 1]
+ if is_first_stage:
+ in_channels = config.num_channels
+ self.stages.append(
+ VanStage(
+ config,
+ in_channels,
+ hidden_size,
+ patch_size=patch_size,
+ stride=stride,
+ depth=depth,
+ mlp_ratio=mlp_expansion,
+ drop_path_rate=drop_path_rate,
+ )
+ )
+
+ def forward(
+ self,
+ hidden_state: torch.Tensor,
+ output_hidden_states: Optional[bool] = False,
+ return_dict: Optional[bool] = True,
+ ) -> Union[tuple, BaseModelOutputWithNoAttention]:
+ all_hidden_states = () if output_hidden_states else None
+
+ for _, stage_module in enumerate(self.stages):
+ hidden_state = stage_module(hidden_state)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_state,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_state, all_hidden_states] if v is not None)
+
+ return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states)
+
+
+class VanPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: VanConfig
+ base_model_prefix = "van"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ """Initialize the weights"""
+ if isinstance(module, nn.Linear):
+ nn.init.trunc_normal_(module.weight, std=self.config.initializer_range)
+ if isinstance(module, nn.Linear) and module.bias is not None:
+ nn.init.constant_(module.bias, 0)
+ elif isinstance(module, nn.LayerNorm):
+ nn.init.constant_(module.bias, 0)
+ nn.init.constant_(module.weight, 1.0)
+ elif isinstance(module, nn.Conv2d):
+ fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
+ fan_out //= module.groups
+ module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+ if module.bias is not None:
+ module.bias.data.zero_()
+
+
+VAN_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`VanConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VAN_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`ConvNextImageProcessor.__call__`] for details.
+
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding"
+ " layer.",
+ VAN_START_DOCSTRING,
+)
+class VanModel(VanPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.config = config
+ self.encoder = VanEncoder(config)
+ # final layernorm layer
+ self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPoolingAndNoAttention,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor],
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ encoder_outputs = self.encoder(
+ pixel_values,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ last_hidden_state = encoder_outputs[0]
+ # global average pooling, n c h w -> n c
+ pooled_output = last_hidden_state.mean(dim=[-2, -1])
+
+ if not return_dict:
+ return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+ return BaseModelOutputWithPoolingAndNoAttention(
+ last_hidden_state=last_hidden_state,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ )
+
+
+@add_start_docstrings(
+ """
+ VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+ ImageNet.
+ """,
+ VAN_START_DOCSTRING,
+)
+class VanForImageClassification(VanPreTrainedModel):
+ def __init__(self, config):
+ super().__init__(config)
+ self.van = VanModel(config)
+ # Classifier head
+ self.classifier = (
+ nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+ )
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutputWithNoAttention,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.FloatTensor] = None,
+ labels: Optional[torch.LongTensor] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.van(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+ pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+ logits = self.classifier(pooled_output)
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(labels, logits, self.config)
+
+ if not return_dict:
+ output = (logits,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+__all__ = ["VanForImageClassification", "VanModel", "VanPreTrainedModel"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5bd93aa4dabea9b2adbc54eeeb3664de589f43a
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_vit_hybrid import *
+ from .image_processing_vit_hybrid import *
+ from .modeling_vit_hybrid import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d956afc08a06d342f5e058879239c1bc7869ae93
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d531ea6a6089c1f4ae117b1ff4b0bed81e7e071
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/configuration_vit_hybrid.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0048c553a49f65cb2ad6e54d2f2a321055a49b57
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/image_processing_vit_hybrid.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..15b64f0dfedc77025974e28bd84ceaf2ac47f104
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/__pycache__/modeling_vit_hybrid.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..e96c6c4a1b58cd40e86dfac71dfcb9c0eb19f57c
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py
@@ -0,0 +1,180 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ViT Hybrid model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ...auto.configuration_auto import CONFIG_MAPPING
+from ...bit import BitConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class ViTHybridConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`ViTHybridModel`]. It is used to instantiate a ViT
+ Hybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration
+ with the defaults will yield a similar configuration to that of the ViT Hybrid
+ [google/vit-hybrid-base-bit-384](https://huggingface.co/google/vit-hybrid-base-bit-384) architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
+ The configuration of the backbone in a dictionary or the config object of the backbone.
+ backbone (`str`, *optional*):
+ Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+ will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+ is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+ use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to use pretrained weights for the backbone.
+ use_timm_backbone (`bool`, *optional*, defaults to `False`):
+ Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+ library.
+ backbone_kwargs (`dict`, *optional*):
+ Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+ e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+ hidden_size (`int`, *optional*, defaults to 768):
+ Dimensionality of the encoder layers and the pooler layer.
+ num_hidden_layers (`int`, *optional*, defaults to 12):
+ Number of hidden layers in the Transformer encoder.
+ num_attention_heads (`int`, *optional*, defaults to 12):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ intermediate_size (`int`, *optional*, defaults to 3072):
+ Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+ hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"selu"` and `"gelu_new"` are supported.
+ hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+ The dropout ratio for the attention probabilities.
+ initializer_range (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+ The epsilon used by the layer normalization layers.
+ image_size (`int`, *optional*, defaults to 224):
+ The size (resolution) of each image.
+ patch_size (`int`, *optional*, defaults to 1):
+ The size (resolution) of each patch.
+ num_channels (`int`, *optional*, defaults to 3):
+ The number of input channels.
+ backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
+ Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
+ qkv_bias (`bool`, *optional*, defaults to `True`):
+ Whether to add a bias to the queries, keys and values.
+
+ Example:
+
+ ```python
+ >>> from transformers import ViTHybridConfig, ViTHybridModel
+
+ >>> # Initializing a ViT Hybrid vit-hybrid-base-bit-384 style configuration
+ >>> configuration = ViTHybridConfig()
+
+ >>> # Initializing a model (with random weights) from the vit-hybrid-base-bit-384 style configuration
+ >>> model = ViTHybridModel(configuration)
+
+ >>> # Accessing the model configuration
+ >>> configuration = model.config
+ ```"""
+
+ model_type = "vit-hybrid"
+
+ def __init__(
+ self,
+ backbone_config=None,
+ backbone=None,
+ use_pretrained_backbone=False,
+ use_timm_backbone=False,
+ backbone_kwargs=None,
+ hidden_size=768,
+ num_hidden_layers=12,
+ num_attention_heads=12,
+ intermediate_size=3072,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.0,
+ attention_probs_dropout_prob=0.0,
+ initializer_range=0.02,
+ layer_norm_eps=1e-12,
+ image_size=224,
+ patch_size=1,
+ num_channels=3,
+ backbone_featmap_shape=[1, 1024, 24, 24],
+ qkv_bias=True,
+ **kwargs,
+ ):
+ super().__init__(**kwargs)
+ if use_pretrained_backbone:
+ raise ValueError("Pretrained backbones are not supported yet.")
+
+ if backbone_config is not None and backbone is not None:
+ raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+ if backbone_config is None and backbone is None:
+ logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.")
+ backbone_config = {
+ "global_padding": "same",
+ "layer_type": "bottleneck",
+ "depths": [3, 4, 9],
+ "out_features": ["stage3"],
+ "embedding_dynamic_padding": True,
+ }
+
+ if backbone_kwargs and backbone_config is not None:
+ raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
+
+ if isinstance(backbone_config, dict):
+ if "model_type" in backbone_config:
+ backbone_config_class = CONFIG_MAPPING[backbone_config["model_type"]]
+ else:
+ logger.info(
+ "`model_type` is not found in `backbone_config`. Use `Bit` as the backbone configuration class."
+ )
+ backbone_config_class = BitConfig
+ backbone_config = backbone_config_class(**backbone_config)
+
+ self.backbone_featmap_shape = backbone_featmap_shape
+ self.backbone_config = backbone_config
+ self.backbone = backbone
+ self.use_pretrained_backbone = use_pretrained_backbone
+ self.use_timm_backbone = use_timm_backbone
+ self.backbone_kwargs = backbone_kwargs
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.layer_norm_eps = layer_norm_eps
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+ self.qkv_bias = qkv_bias
+
+ @property
+ def sub_configs(self):
+ return (
+ {"backbone_config": type(self.backbone_config)}
+ if getattr(self, "backbone_config", None) is not None
+ else {}
+ )
+
+
+__all__ = ["ViTHybridConfig"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..92d518363b2c34b5b30fd04dc0dbe982c8551e3d
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py
@@ -0,0 +1,341 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for ViT hybrid."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
+ convert_to_rgb,
+ get_resize_output_image_size,
+ resize,
+ to_channel_dimension_format,
+)
+from ....image_utils import (
+ OPENAI_CLIP_MEAN,
+ OPENAI_CLIP_STD,
+ ChannelDimension,
+ ImageInput,
+ PILImageResampling,
+ infer_channel_dimension_format,
+ is_scaled_image,
+ make_flat_list_of_images,
+ to_numpy_array,
+ valid_images,
+ validate_kwargs,
+ validate_preprocess_arguments,
+)
+from ....utils import TensorType, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+ import PIL
+
+
+class ViTHybridImageProcessor(BaseImageProcessor):
+ r"""
+ Constructs a ViT Hybrid image processor.
+
+ Args:
+ do_resize (`bool`, *optional*, defaults to `True`):
+ Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+ `do_resize` in the `preprocess` method.
+ size (`dict[str, int]`, *optional*, defaults to `{"shortest_edge": 224}`):
+ Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+ the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+ method.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+ Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+ do_center_crop (`bool`, *optional*, defaults to `True`):
+ Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+ `preprocess` method.
+ crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+ Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+ method.
+ do_rescale (`bool`, *optional*, defaults to `True`):
+ Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+ the `preprocess` method.
+ rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+ Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+ method.
+ do_normalize (`bool`, *optional*, defaults to `True`):
+ Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_MEAN`):
+ Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+ channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+ image_std (`float` or `list[float]`, *optional*, defaults to `OPENAI_CLIP_STD`):
+ Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+ number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+ do_convert_rgb (`bool`, *optional*, defaults to `True`):
+ Whether to convert the image to RGB.
+ """
+
+ model_input_names = ["pixel_values"]
+
+ def __init__(
+ self,
+ do_resize: bool = True,
+ size: Optional[dict[str, int]] = None,
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ do_center_crop: bool = True,
+ crop_size: Optional[dict[str, int]] = None,
+ do_rescale: bool = True,
+ rescale_factor: Union[int, float] = 1 / 255,
+ do_normalize: bool = True,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ do_convert_rgb: bool = True,
+ **kwargs,
+ ) -> None:
+ super().__init__(**kwargs)
+ size = size if size is not None else {"shortest_edge": 224}
+ size = get_size_dict(size, default_to_square=False)
+ crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+ crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+
+ self.do_resize = do_resize
+ self.size = size
+ self.resample = resample
+ self.do_center_crop = do_center_crop
+ self.crop_size = crop_size
+ self.do_rescale = do_rescale
+ self.rescale_factor = rescale_factor
+ self.do_normalize = do_normalize
+ self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+ self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+ self.do_convert_rgb = do_convert_rgb
+ self._valid_processor_keys = [
+ "images",
+ "do_resize",
+ "size",
+ "resample",
+ "do_center_crop",
+ "crop_size",
+ "do_rescale",
+ "rescale_factor",
+ "do_normalize",
+ "image_mean",
+ "image_std",
+ "do_convert_rgb",
+ "return_tensors",
+ "data_format",
+ "input_data_format",
+ ]
+
+ def resize(
+ self,
+ image: np.ndarray,
+ size: dict[str, int],
+ resample: PILImageResampling = PILImageResampling.BICUBIC,
+ data_format: Optional[Union[str, ChannelDimension]] = None,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> np.ndarray:
+ """
+ Resize an image. If `size` contains the key `"shortest_edge"`, the shortest edge of the image is resized to
+ size["shortest_edge"], with the longest edge resized to keep the input aspect ratio. If `size` contains
+ `"height"` and `"width"`, the image is resized to exactly that size.
+
+ Args:
+ image (`np.ndarray`):
+ Image to resize.
+ size (`dict[str, int]`):
+ Size of the output image.
+ resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+ Resampling filter to use when resizing the image.
+ data_format (`str` or `ChannelDimension`, *optional*):
+ The channel dimension format of the image. If not provided, it will be the same as the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format of the input image. If not provided, it will be inferred.
+ """
+ default_to_square = True
+ if "shortest_edge" in size:
+ size = size["shortest_edge"]
+ default_to_square = False
+ elif "height" in size and "width" in size:
+ size = (size["height"], size["width"])
+ else:
+ raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
+ output_size = get_resize_output_image_size(
+ image,
+ size=size,
+ default_to_square=default_to_square,
+ input_data_format=input_data_format,
+ )
+ return resize(
+ image,
+ size=output_size,
+ resample=resample,
+ data_format=data_format,
+ input_data_format=input_data_format,
+ **kwargs,
+ )
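+
+ # A minimal worked example of the shortest-edge arithmetic above (illustrative
+ # numbers, not taken from any checkpoint): for a 480x960 (height x width)
+ # image and size={"shortest_edge": 224},
+ # scale = 224 / 480 # shorter side is scaled to 224
+ # output = (224, int(960 * scale)) # -> (224, 448), aspect ratio kept
+ # whereas size={"height": 224, "width": 224} resizes to exactly 224x224.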
+
+ def preprocess(
+ self,
+ images: ImageInput,
+ do_resize: Optional[bool] = None,
+ size: Optional[dict[str, int]] = None,
+ resample: Optional[PILImageResampling] = None,
+ do_center_crop: Optional[bool] = None,
+ crop_size: Optional[dict[str, int]] = None,
+ do_rescale: Optional[bool] = None,
+ rescale_factor: Optional[float] = None,
+ do_normalize: Optional[bool] = None,
+ image_mean: Optional[Union[float, list[float]]] = None,
+ image_std: Optional[Union[float, list[float]]] = None,
+ do_convert_rgb: Optional[bool] = None,
+ return_tensors: Optional[Union[str, TensorType]] = None,
+ data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+ input_data_format: Optional[Union[str, ChannelDimension]] = None,
+ **kwargs,
+ ) -> BatchFeature:
+ """
+ Preprocess an image or batch of images.
+
+ Args:
+ images (`ImageInput`):
+ Image to preprocess. Expects a single image or a batch of images with pixel values ranging from 0 to 255. If
+ passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+ do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+ Whether to resize the image.
+ size (`dict[str, int]`, *optional*, defaults to `self.size`):
+ Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+ the longest edge resized to keep the input aspect ratio.
+ resample (`int`, *optional*, defaults to `self.resample`):
+ Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` enum values.
+ Only has an effect if `do_resize` is set to `True`.
+ do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+ Whether to center crop the image.
+ crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+ Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+ do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+ Whether to rescale the image.
+ rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+ Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+ do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+ Whether to normalize the image.
+ image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+ Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+ image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+ Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+ `True`.
+ do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+ Whether to convert the image to RGB.
+ return_tensors (`str` or `TensorType`, *optional*):
+ The type of tensors to return. Can be one of:
+ - Unset: Return a list of `np.ndarray`.
+ - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+ - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+ - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+ - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+ data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+ The channel dimension format for the output image. Can be one of:
+ - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - Unset: defaults to the channel dimension format of the input image.
+ input_data_format (`ChannelDimension` or `str`, *optional*):
+ The channel dimension format for the input image. If unset, the channel dimension format is inferred
+ from the input image. Can be one of:
+ - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+ - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+ - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+ """
+ do_resize = do_resize if do_resize is not None else self.do_resize
+ size = size if size is not None else self.size
+ size = get_size_dict(size, param_name="size", default_to_square=False)
+ resample = resample if resample is not None else self.resample
+ do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+ crop_size = crop_size if crop_size is not None else self.crop_size
+ crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+ do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+ rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+ do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+ image_mean = image_mean if image_mean is not None else self.image_mean
+ image_std = image_std if image_std is not None else self.image_std
+ do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+ images = make_flat_list_of_images(images)
+
+ validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+ if not valid_images(images):
+ raise ValueError(
+ "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+ "torch.Tensor, tf.Tensor or jax.ndarray."
+ )
+ validate_preprocess_arguments(
+ do_rescale=do_rescale,
+ rescale_factor=rescale_factor,
+ do_normalize=do_normalize,
+ image_mean=image_mean,
+ image_std=image_std,
+ do_center_crop=do_center_crop,
+ crop_size=crop_size,
+ do_resize=do_resize,
+ size=size,
+ resample=resample,
+ )
+
+ # PIL RGBA images are converted to RGB
+ if do_convert_rgb:
+ images = [convert_to_rgb(image) for image in images]
+
+ # All transformations expect numpy arrays.
+ images = [to_numpy_array(image) for image in images]
+
+ if do_rescale and is_scaled_image(images[0]):
+ logger.warning_once(
+ "It looks like you are trying to rescale already rescaled images. If the input"
+ " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+ )
+
+ if input_data_format is None:
+ # We assume that all images have the same channel dimension format.
+ input_data_format = infer_channel_dimension_format(images[0])
+
+ all_images = []
+ for image in images:
+ if do_resize:
+ image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+ if do_center_crop:
+ image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
+
+ if do_rescale:
+ image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+ if do_normalize:
+ image = self.normalize(
+ image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+ )
+
+ all_images.append(image)
+ images = [
+ to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+ for image in all_images
+ ]
+
+ data = {"pixel_values": images}
+ return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["ViTHybridImageProcessor"]
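+
+# A minimal usage sketch with the default parameters (the dummy image below is
+# illustrative; any PIL image, numpy array, or torch tensor works the same way):
+#
+# from PIL import Image
+# import numpy as np
+# processor = ViTHybridImageProcessor() # resize -> center crop -> rescale -> normalize
+# image = Image.fromarray(np.zeros((480, 640, 3), dtype=np.uint8))
+# inputs = processor(images=image, return_tensors="pt")
+# print(inputs["pixel_values"].shape) # torch.Size([1, 3, 224, 224])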
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..86b1594a20c92b8f2f528714d4ea9f36e93b8e4d
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
@@ -0,0 +1,740 @@
+# coding=utf-8
+# Copyright 2022 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ViT Hybrid model."""
+
+import collections.abc
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ torch_int,
+)
+from ....utils.backbone_utils import load_backbone
+from .configuration_vit_hybrid import ViTHybridConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "ViTHybridConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "google/vit-hybrid-base-bit-384"
+_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "google/vit-hybrid-base-bit-384"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+class ViTHybridEmbeddings(nn.Module):
+ """
+ Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+ """
+
+ def __init__(self, config: ViTHybridConfig, use_mask_token: bool = False) -> None:
+ super().__init__()
+
+ self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+ self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
+ self.patch_embeddings = ViTHybridPatchEmbeddings(config)
+ num_patches = self.patch_embeddings.num_patches
+ self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+ self.patch_size = config.patch_size
+ self.config = config
+
+ # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
+ def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+ """
+ This method interpolates the pre-trained position encodings so that the model can be used on higher-resolution
+ images. It is also adapted to support torch.jit tracing.
+
+ Adapted from:
+ - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+ - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+ """
+
+ num_patches = embeddings.shape[1] - 1
+ num_positions = self.position_embeddings.shape[1] - 1
+
+ # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+ if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+ return self.position_embeddings
+
+ class_pos_embed = self.position_embeddings[:, :1]
+ patch_pos_embed = self.position_embeddings[:, 1:]
+
+ dim = embeddings.shape[-1]
+
+ new_height = height // self.patch_size
+ new_width = width // self.patch_size
+
+ sqrt_num_positions = torch_int(num_positions**0.5)
+ patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+ patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+ patch_pos_embed = nn.functional.interpolate(
+ patch_pos_embed,
+ size=(new_height, new_width),
+ mode="bicubic",
+ align_corners=False,
+ )
+
+ patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+ return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
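+
+ # A concrete shape trace of the interpolation above (illustrative numbers):
+ # with 196 pre-trained patch positions, sqrt_num_positions = 14, so
+ # patch_pos_embed: (1, 196, dim) -> (1, dim, 14, 14)
+ # bicubic interpolation to a larger grid, e.g.: (1, dim, 24, 24)
+ # flattened back: (1, 576, dim), then the class position is re-prepended.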
+
+ def forward(
+ self,
+ pixel_values: torch.Tensor,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ interpolate_pos_encoding: bool = False,
+ ) -> torch.Tensor:
+ batch_size, num_channels, height, width = pixel_values.shape
+ embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+ if bool_masked_pos is not None:
+ seq_length = embeddings.shape[1]
+ mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
+ # replace the masked visual tokens by mask_tokens
+ mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+ embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+ # add the [CLS] token to the embedded patch tokens
+ cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+ embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+ # add positional encoding to each token
+ if interpolate_pos_encoding:
+ embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+ else:
+ embeddings = embeddings + self.position_embeddings
+
+ embeddings = self.dropout(embeddings)
+
+ return embeddings
+
+
+class ViTHybridPatchEmbeddings(nn.Module):
+ """
+ This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+ `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+ Transformer.
+ """
+
+ def __init__(self, config, feature_size=None):
+ super().__init__()
+ image_size, patch_size = config.image_size, config.patch_size
+ num_channels, hidden_size = config.num_channels, config.hidden_size
+
+ image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+ patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+
+ self.backbone = load_backbone(config)
+ if self.backbone.config.model_type != "bit":
+ raise ValueError(f"Backbone model type {self.backbone.model_type} is not supported.")
+ feature_dim = self.backbone.channels[-1]
+
+ if feature_size is None:
+ feature_map = config.backbone_featmap_shape
+
+ feature_size = feature_map[-2:]
+ feature_dim = feature_map[1]
+ else:
+ feature_size = (
+ feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
+ )
+ feature_dim = self.backbone.channels[-1]
+
+ self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1])
+ self.num_patches = self.grid_size[0] * self.grid_size[1]
+ self.image_size = image_size
+ self.patch_size = patch_size
+ self.num_channels = num_channels
+
+ self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+ def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+ _, num_channels, height, width = pixel_values.shape
+ if num_channels != self.num_channels:
+ raise ValueError(
+ "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+ )
+ if not interpolate_pos_encoding:
+ if height != self.image_size[0] or width != self.image_size[1]:
+ raise ValueError(
+ f"Input image size ({height}*{width}) doesn't match model"
+ f" ({self.image_size[0]}*{self.image_size[1]})."
+ )
+
+ features = self.backbone(pixel_values).feature_maps[-1]
+ embeddings = self.projection(features).flatten(2).transpose(1, 2)
+
+ return embeddings
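+
+ # Shape walk-through of the forward pass above (assuming the BiT backbone's
+ # last feature map has `feature_dim` channels):
+ # pixel_values: (B, 3, H, W)
+ # backbone feature map: (B, feature_dim, h, w)
+ # Conv2d projection (kernel = stride = patch_size): (B, hidden_size, h/p, w/p)
+ # flatten(2).transpose(1, 2): (B, (h/p) * (w/p), hidden_size)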
+
+
+class ViTHybridSelfAttention(nn.Module):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__()
+ if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+ raise ValueError(
+ f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+ f"heads {config.num_attention_heads}."
+ )
+
+ self.num_attention_heads = config.num_attention_heads
+ self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+ self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+ self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+ self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+ self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+ def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+ new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+ x = x.view(new_x_shape)
+ return x.permute(0, 2, 1, 3)
+
+ def forward(
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+ ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ # Take the dot product between "query" and "key" to get the raw attention scores.
+ attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+ attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+ # Normalize the attention scores to probabilities.
+ attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+ # This is actually dropping out entire tokens to attend to, which might
+ # seem a bit unusual, but is taken from the original Transformer paper.
+ attention_probs = self.dropout(attention_probs)
+
+ # Mask heads if we want to
+ if head_mask is not None:
+ attention_probs = attention_probs * head_mask
+
+ context_layer = torch.matmul(attention_probs, value_layer)
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+ return outputs
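+
+ # Shape sketch for the eager attention above (illustrative numbers): with
+ # hidden_size = 768 and 12 heads, attention_head_size = 64, so for a sequence
+ # of length N:
+ # Q, K, V after transpose_for_scores: (B, 12, N, 64)
+ # scores = Q @ K^T / sqrt(64): (B, 12, N, N)
+ # context = softmax(scores) @ V: (B, 12, N, 64), reshaped to (B, N, 768)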
+
+
+class ViTHybridSdpaSelfAttention(ViTHybridSelfAttention):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__(config)
+ self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
+
+ def forward(
+ self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+ ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+ mixed_query_layer = self.query(hidden_states)
+
+ key_layer = self.transpose_for_scores(self.key(hidden_states))
+ value_layer = self.transpose_for_scores(self.value(hidden_states))
+ query_layer = self.transpose_for_scores(mixed_query_layer)
+
+ context_layer = torch.nn.functional.scaled_dot_product_attention(
+ query_layer,
+ key_layer,
+ value_layer,
+ head_mask,
+ self.attention_probs_dropout_prob if self.training else 0.0,
+ is_causal=False,
+ scale=None,
+ )
+
+ context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+ new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+ context_layer = context_layer.view(new_context_layer_shape)
+
+ return context_layer, None
+
+
+class ViTHybridSelfOutput(nn.Module):
+ """
+ The residual connection is defined in ViTHybridLayer instead of here (as is the case with other models), due to the
+ layernorm applied before each block.
+ """
+
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ return hidden_states
+
+
+class ViTHybridAttention(nn.Module):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__()
+ self.attention = ViTHybridSelfAttention(config)
+ self.output = ViTHybridSelfOutput(config)
+ self.pruned_heads = set()
+
+ def prune_heads(self, heads: set[int]) -> None:
+ if len(heads) == 0:
+ return
+ heads, index = find_pruneable_heads_and_indices(
+ heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+ )
+
+ # Prune linear layers
+ self.attention.query = prune_linear_layer(self.attention.query, index)
+ self.attention.key = prune_linear_layer(self.attention.key, index)
+ self.attention.value = prune_linear_layer(self.attention.value, index)
+ self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+ # Update hyper params and store pruned heads
+ self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+ self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+ self.pruned_heads = self.pruned_heads.union(heads)
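+
+ # Illustrative call: pruning heads 0 and 1 of this module shrinks the
+ # query/key/value projections from all_head_size output features to
+ # (num_attention_heads - 2) * attention_head_size:
+ # attention.prune_heads({0, 1})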
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+ self_outputs = self.attention(hidden_states, head_mask, output_attentions)
+
+ attention_output = self.output(self_outputs[0], hidden_states)
+
+ outputs = (attention_output,) + self_outputs[1:] # add attentions if we output them
+ return outputs
+
+
+class ViTHybridSdpaAttention(ViTHybridAttention):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__(config)
+ self.attention = ViTHybridSdpaSelfAttention(config)
+
+
+class ViTHybridIntermediate(nn.Module):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+ if isinstance(config.hidden_act, str):
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
+ else:
+ self.intermediate_act_fn = config.hidden_act
+
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.intermediate_act_fn(hidden_states)
+
+ return hidden_states
+
+
+class ViTHybridOutput(nn.Module):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__()
+ self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+ self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+ def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+ hidden_states = self.dense(hidden_states)
+ hidden_states = self.dropout(hidden_states)
+
+ hidden_states = hidden_states + input_tensor
+
+ return hidden_states
+
+
+VIT_HYBRID_ATTENTION_CLASSES = {
+ "eager": ViTHybridAttention,
+ "sdpa": ViTHybridSdpaAttention,
+}
+
+
+class ViTHybridLayer(GradientCheckpointingLayer):
+ """This corresponds to the Block class in the timm implementation."""
+
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__()
+ self.chunk_size_feed_forward = config.chunk_size_feed_forward
+ self.seq_len_dim = 1
+ self.attention = VIT_HYBRID_ATTENTION_CLASSES[config._attn_implementation](config)
+ self.intermediate = ViTHybridIntermediate(config)
+ self.output = ViTHybridOutput(config)
+ self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+ self_attention_outputs = self.attention(
+ self.layernorm_before(hidden_states), # in ViTHybrid, layernorm is applied before self-attention
+ head_mask,
+ output_attentions=output_attentions,
+ )
+ attention_output = self_attention_outputs[0]
+ outputs = self_attention_outputs[1:] # add self attentions if we output attention weights
+
+ # first residual connection
+ # We assign to correct device for `accelerate`, check: https://github.com/huggingface/transformers/pull/20705/
+ hidden_states = attention_output + hidden_states.to(attention_output.device)
+
+ # in ViTHybrid, layernorm is also applied after self-attention
+ layer_output = self.layernorm_after(hidden_states)
+ layer_output = self.intermediate(layer_output)
+
+ # second residual connection is done here
+ layer_output = self.output(layer_output, hidden_states)
+
+ outputs = (layer_output,) + outputs
+
+ return outputs
+
+
+class ViTHybridEncoder(nn.Module):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__()
+ self.config = config
+ self.layer = nn.ModuleList([ViTHybridLayer(config) for _ in range(config.num_hidden_layers)])
+ self.gradient_checkpointing = False
+
+ def forward(
+ self,
+ hidden_states: torch.Tensor,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: bool = False,
+ output_hidden_states: bool = False,
+ return_dict: bool = True,
+ ) -> Union[tuple, BaseModelOutput]:
+ all_hidden_states = () if output_hidden_states else None
+ all_self_attentions = () if output_attentions else None
+
+ for i, layer_module in enumerate(self.layer):
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ layer_head_mask = head_mask[i] if head_mask is not None else None
+
+ layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ all_hidden_states = all_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states,
+ hidden_states=all_hidden_states,
+ attentions=all_self_attentions,
+ )
+
+
+class ViTHybridPreTrainedModel(PreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+ """
+
+ config: ViTHybridConfig
+ base_model_prefix = "vit"
+ main_input_name = "pixel_values"
+ supports_gradient_checkpointing = True
+ _no_split_modules = ["ViTHybridEmbeddings", "ViTHybridLayer"]
+ _supports_sdpa = True
+
+ def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+ """Initialize the weights"""
+ if isinstance(module, (nn.Linear, nn.Conv2d)):
+ # Upcast the input to `fp32` and cast it back to the desired `dtype` to avoid
+ # `trunc_normal_cpu` not being implemented for `half` inputs
+ module.weight.data = nn.init.trunc_normal_(
+ module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+ ).to(module.weight.dtype)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.LayerNorm):
+ module.bias.data.zero_()
+ module.weight.data.fill_(1.0)
+ elif isinstance(module, ViTHybridEmbeddings):
+ module.position_embeddings.data = nn.init.trunc_normal_(
+ module.position_embeddings.data.to(torch.float32),
+ mean=0.0,
+ std=self.config.initializer_range,
+ ).to(module.position_embeddings.dtype)
+ module.cls_token.data = nn.init.trunc_normal_(
+ module.cls_token.data.to(torch.float32),
+ mean=0.0,
+ std=self.config.initializer_range,
+ ).to(module.cls_token.dtype)
+ if module.mask_token is not None:
+ module.mask_token.data.zero_()
+
+
+VIT_START_DOCSTRING = r"""
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+ as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`ViTHybridConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VIT_INPUTS_DOCSTRING = r"""
+ Args:
+ pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+ Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+ [`ViTHybridImageProcessor.__call__`] for details.
+
+ head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+ "The bare ViT Hybrid Model transformer outputting raw hidden-states without any specific head on top.",
+ VIT_START_DOCSTRING,
+)
+class ViTHybridModel(ViTHybridPreTrainedModel):
+ def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
+ super().__init__(config)
+ self.config = config
+
+ self.embeddings = ViTHybridEmbeddings(config, use_mask_token=use_mask_token)
+ self.encoder = ViTHybridEncoder(config)
+
+ self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+ self.pooler = ViTHybridPooler(config) if add_pooling_layer else None
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self) -> ViTHybridPatchEmbeddings:
+ return self.embeddings.patch_embeddings
+
+ def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+ """
+ Prunes heads of the model. `heads_to_prune` is a dict of {layer_num: list of heads to prune in this layer}.
+ See the base class `PreTrainedModel`.
+ """
+ for layer, heads in heads_to_prune.items():
+ self.encoder.layer[layer].attention.prune_heads(heads)
+
+ @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=BaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ modality="vision",
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ bool_masked_pos: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, BaseModelOutputWithPooling]:
+ r"""
+ bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+ Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+ """
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if pixel_values is None:
+ raise ValueError("You have to specify pixel_values")
+
+ # Prepare head mask if needed
+ # 1.0 in head_mask indicates we keep the head
+ # attention_probs has shape bsz x n_heads x N x N
+ # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+ # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+ head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+ # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
+ expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
+ if pixel_values.dtype != expected_dtype:
+ pixel_values = pixel_values.to(expected_dtype)
+
+ embedding_output = self.embeddings(
+ pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+ )
+
+ encoder_outputs = self.encoder(
+ embedding_output,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ sequence_output = encoder_outputs[0]
+ sequence_output = self.layernorm(sequence_output)
+ pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+ if not return_dict:
+ head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+ return head_outputs + encoder_outputs[1:]
+
+ return BaseModelOutputWithPooling(
+ last_hidden_state=sequence_output,
+ pooler_output=pooled_output,
+ hidden_states=encoder_outputs.hidden_states,
+ attentions=encoder_outputs.attentions,
+ )
+
+
+class ViTHybridPooler(nn.Module):
+ def __init__(self, config: ViTHybridConfig):
+ super().__init__()
+ self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+ self.activation = nn.Tanh()
+
+ def forward(self, hidden_states):
+ # We "pool" the model by simply taking the hidden state corresponding
+ # to the first token.
+ first_token_tensor = hidden_states[:, 0]
+ pooled_output = self.dense(first_token_tensor)
+ pooled_output = self.activation(pooled_output)
+ return pooled_output
+
+
+@add_start_docstrings(
+ """
+ ViT Hybrid Model transformer with an image classification head on top (a linear layer on top of the final hidden
+ state of the [CLS] token) e.g. for ImageNet.
+ """,
+ VIT_START_DOCSTRING,
+)
+class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
+ def __init__(self, config: ViTHybridConfig) -> None:
+ super().__init__(config)
+
+ self.num_labels = config.num_labels
+ self.vit = ViTHybridModel(config, add_pooling_layer=False)
+
+ # Classifier head
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
+ @add_code_sample_docstrings(
+ checkpoint=_IMAGE_CLASS_CHECKPOINT,
+ output_type=ImageClassifierOutput,
+ config_class=_CONFIG_FOR_DOC,
+ expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+ )
+ def forward(
+ self,
+ pixel_values: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ interpolate_pos_encoding: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, ImageClassifierOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+ Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+ config.num_labels - 1]`. If `config.num_labels == 1`, a regression loss is computed (mean squared error). If
+ `config.num_labels > 1`, a classification loss is computed (cross-entropy).
+ """
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ outputs = self.vit(
+ pixel_values,
+ head_mask=head_mask,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ interpolate_pos_encoding=interpolate_pos_encoding,
+ return_dict=return_dict,
+ )
+
+ sequence_output = outputs[0]
+
+ logits = self.classifier(sequence_output[:, 0, :])
+
+ loss = None
+ if labels is not None:
+ loss = self.loss_function(labels, logits, self.config)
+
+ if not return_dict:
+ output = (logits,) + outputs[1:]
+ return ((loss,) + output) if loss is not None else output
+
+ return ImageClassifierOutput(
+ loss=loss,
+ logits=logits,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+
+__all__ = ["ViTHybridForImageClassification", "ViTHybridModel", "ViTHybridPreTrainedModel"]
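+
+# A minimal inference sketch for the classification head, using the checkpoint
+# referenced by the docstring constants above (being a deprecated model, it may
+# only load on `transformers` releases that still ship ViT Hybrid):
+#
+# from transformers import AutoImageProcessor
+# processor = AutoImageProcessor.from_pretrained("google/vit-hybrid-base-bit-384")
+# model = ViTHybridForImageClassification.from_pretrained("google/vit-hybrid-base-bit-384")
+# inputs = processor(images=image, return_tensors="pt") # `image` is any PIL image
+# predicted_id = model(**inputs).logits.argmax(-1).item()
+# print(model.config.id2label[predicted_id]) # e.g. "tabby, tabby cat"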
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c13c67012fa157a94bae21734a1f59b26c78588d
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+ from .configuration_xlm_prophetnet import *
+ from .modeling_xlm_prophetnet import *
+ from .tokenization_xlm_prophetnet import *
+else:
+ import sys
+
+ _file = globals()["__file__"]
+ sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a15f09f29ae9fe75c72c02b97130f0763ceaadcd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5be363ff7daf1dd687b3b9fabfac1cd57253be81
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/configuration_xlm_prophetnet.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..de766dc60259e7f34eb22bd26d39576ffe356702
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/__pycache__/tokenization_xlm_prophetnet.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f42577c59f19d9d379eaaec15029eaed6967c3
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py
@@ -0,0 +1,181 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""XLM-ProphetNet model configuration"""
+
+from typing import Callable, Optional, Union
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class XLMProphetNetConfig(PretrainedConfig):
+ r"""
+ This is the configuration class to store the configuration of a [`XLMProphetNetModel`]. It is used to instantiate a
+ XLMProphetNet model according to the specified arguments, defining the model architecture. Instantiating a
+ configuration with the defaults will yield a similar configuration to that of the XLMProphetNet
+ [microsoft/xprophetnet-large-wiki100-cased](https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased)
+ architecture.
+
+ Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+ documentation from [`PretrainedConfig`] for more information.
+
+ Args:
+ activation_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for activations inside the fully connected layer.
+ activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+ The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+ `"relu"`, `"silu"` and `"gelu_new"` are supported.
+ vocab_size (`int`, *optional*, defaults to 30522):
+ Vocabulary size of the XLMProphetNet model. Defines the number of different tokens that can be represented
+ by the `input_ids` passed when calling [`XLMProphetNetModel`].
+ hidden_size (`int`, *optional*, defaults to 1024):
+ Dimensionality of the layers and the pooler layer.
+ encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+ num_encoder_layers (`int`, *optional*, defaults to 12):
+ Number of encoder layers.
+ num_encoder_attention_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer encoder.
+ decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+ Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
+ num_decoder_layers (`int`, *optional*, defaults to 12):
+ Number of decoder layers.
+ num_decoder_attention_heads (`int`, *optional*, defaults to 16):
+ Number of attention heads for each attention layer in the Transformer decoder.
+ attention_dropout (`float`, *optional*, defaults to 0.1):
+ The dropout ratio for the attention probabilities.
+ dropout (`float`, *optional*, defaults to 0.1):
+ The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+ max_position_embeddings (`int`, *optional*, defaults to 512):
+ The maximum sequence length that this model might ever be used with. Typically set this to something large
+ just in case (e.g., 512 or 1024 or 2048).
+ init_std (`float`, *optional*, defaults to 0.02):
+ The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+ add_cross_attention (`bool`, *optional*, defaults to `True`):
+ Whether cross-attention layers should be added to the model.
+ is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+ Whether this is an encoder/decoder model.
+ pad_token_id (`int`, *optional*, defaults to 0):
+ Padding token id.
+ bos_token_id (`int`, *optional*, defaults to 1):
+ Beginning of stream token id.
+ eos_token_id (`int`, *optional*, defaults to 2):
+ End of stream token id.
+ ngram (`int`, *optional*, defaults to 2):
+ Number of future tokens to predict. Set to 1 to behave like a traditional language model and predict only
+ the next token.
+ num_buckets (`int`, *optional*, defaults to 32):
+ The number of buckets to use for each attention layer, used for relative position calculation. See the
+ [T5 paper](https://huggingface.co/papers/1910.10683) for more details.
+ relative_max_distance (`int`, *optional*, defaults to 128):
+ Relative distances greater than this number will all be put into the same (last) bucket, used for relative
+ position calculation. See the [T5 paper](https://huggingface.co/papers/1910.10683) for more details.
+ disable_ngram_loss (`bool`, *optional*, defaults to `False`):
+ Whether to train the model to predict only the next token, disabling the n-gram loss.
+ eps (`float`, *optional*, defaults to 0.0):
+ Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label
+ smoothing is performed.
+ use_cache (`bool`, *optional*, defaults to `True`):
+ Whether or not the model should return the last key/values attentions (not used by all models).
+ """
+
+ model_type = "xlm-prophetnet"
+ keys_to_ignore_at_inference = ["past_key_values"]
+ attribute_map = {
+ "num_attention_heads": "num_encoder_attention_heads",
+ }
+
+ def __init__(
+ self,
+ activation_dropout: Optional[float] = 0.1,
+ activation_function: Optional[Union[str, Callable]] = "gelu",
+ vocab_size: Optional[int] = 30522,
+ hidden_size: Optional[int] = 1024,
+ encoder_ffn_dim: Optional[int] = 4096,
+ num_encoder_layers: Optional[int] = 12,
+ num_encoder_attention_heads: Optional[int] = 16,
+ decoder_ffn_dim: Optional[int] = 4096,
+ num_decoder_layers: Optional[int] = 12,
+ num_decoder_attention_heads: Optional[int] = 16,
+ attention_dropout: Optional[float] = 0.1,
+ dropout: Optional[float] = 0.1,
+ max_position_embeddings: Optional[int] = 512,
+ init_std: Optional[float] = 0.02,
+ is_encoder_decoder: Optional[bool] = True,
+ add_cross_attention: Optional[bool] = True,
+ decoder_start_token_id: Optional[int] = 0,
+ ngram: Optional[int] = 2,
+ num_buckets: Optional[int] = 32,
+ relative_max_distance: Optional[int] = 128,
+ disable_ngram_loss: Optional[bool] = False,
+ eps: Optional[float] = 0.0,
+ use_cache: Optional[bool] = True,
+ pad_token_id: Optional[int] = 0,
+ bos_token_id: Optional[int] = 1,
+ eos_token_id: Optional[int] = 2,
+ **kwargs,
+ ):
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.encoder_ffn_dim = encoder_ffn_dim
+ self.num_encoder_layers = num_encoder_layers
+ self.num_encoder_attention_heads = num_encoder_attention_heads
+ self.decoder_ffn_dim = decoder_ffn_dim
+ self.num_decoder_layers = num_decoder_layers
+ self.num_decoder_attention_heads = num_decoder_attention_heads
+ self.max_position_embeddings = max_position_embeddings
+ self.init_std = init_std # Normal(0, this parameter)
+ self.activation_function = activation_function
+
+ # parameters for xlmprophetnet
+ self.ngram = ngram
+ self.num_buckets = num_buckets
+ self.relative_max_distance = relative_max_distance
+ self.disable_ngram_loss = disable_ngram_loss
+ self.eps = eps
+
+ # 3 Types of Dropout
+ self.attention_dropout = attention_dropout
+ self.activation_dropout = activation_dropout
+ self.dropout = dropout
+
+ self.use_cache = use_cache
+
+ super().__init__(
+ pad_token_id=pad_token_id,
+ bos_token_id=bos_token_id,
+ eos_token_id=eos_token_id,
+ is_encoder_decoder=is_encoder_decoder,
+ add_cross_attention=add_cross_attention,
+ decoder_start_token_id=decoder_start_token_id,
+ **kwargs,
+ )
+
+ @property
+ def num_hidden_layers(self) -> int:
+ return self.num_encoder_layers + self.num_decoder_layers
+
+ @num_hidden_layers.setter
+ def num_hidden_layers(self, value):
+ raise NotImplementedError(
+ "This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and"
+ " `num_decoder_layers`."
+ )
+
+
+__all__ = ["XLMProphetNetConfig"]
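+
+# A minimal usage sketch: the `num_hidden_layers` property above is the sum of
+# encoder and decoder layers and is deliberately read-only:
+#
+# config = XLMProphetNetConfig()
+# print(config.num_hidden_layers) # 24 (12 encoder + 12 decoder layers)
+# config = XLMProphetNetConfig(num_encoder_layers=6, num_decoder_layers=6)
+# print(config.num_hidden_layers) # 12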
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..36f6e6097bc3e55a153b991df9161b01c09c153a
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py
@@ -0,0 +1,2305 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch XLM-ProphetNet model."""
+
+import copy
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+from torch.nn import LayerNorm
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+ ModelOutput,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ logging,
+ replace_return_docstrings,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_xlm_prophetnet import XLMProphetNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+_CONFIG_FOR_DOC = "XLMProphetNetConfig"
+
+
+XLM_PROPHETNET_START_DOCSTRING = r"""
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+ library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
+ etc.)
+
+ Original ProphetNet code can be found [here](https://github.com/microsoft/ProphetNet). Checkpoints were converted
+ from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
+ file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`.
+
+ This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+ it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+ behavior.
+
+ Parameters:
+ config ([`XLMProphetNetConfig`]): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+XLM_PROPHETNET_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Indices of decoder input sequence tokens in the vocabulary.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+ XLMProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+ `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+ `past_key_values`).
+
+ decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+ Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+ be used by default.
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ encoder_outputs (`tuple(tuple(torch.FloatTensor))`, *optional*):
+ Tuple consisting of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`).
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)` is a sequence of hidden-states
+ at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+ Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+ it.
+
+ Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+ [`PreTrainedTokenizer.__call__`] for details.
+
+ [What are input IDs?](../glossary#input-ids)
+ attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ [What are attention masks?](../glossary#attention-mask)
+ head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ output_attentions (`bool`, *optional*):
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+ tensors for more detail.
+ output_hidden_states (`bool`, *optional*):
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+ more detail.
+ return_dict (`bool`, *optional*):
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def softmax(hidden_state, dim, onnx_trace=False):
+ if onnx_trace:
+ return nn.functional.softmax(hidden_state.float(), dim=dim)
+ else:
+ return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32)
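+
+
+# Editor's illustrative sketch (not part of the upstream API): both branches of
+# `softmax` compute in float32 for numerical stability; the `onnx_trace` branch
+# casts the input explicitly, presumably because the `dtype=` keyword is harder
+# to trace during ONNX export. For example:
+#
+#     x = torch.randn(2, 4, dtype=torch.float16)
+#     assert torch.allclose(softmax(x, -1, onnx_trace=True), softmax(x, -1))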
+
+
+def ngram_attention_bias(sequence_length, ngram, device, dtype):
+ """
+ This function computes the bias for the predict stream
+ """
+ left_block = (
+ torch.ones((ngram, sequence_length, sequence_length), device=device, dtype=dtype) * torch.finfo(dtype).min
+ )
+ right_block = left_block.detach().clone()
+ # create bias
+ for stream_idx in range(ngram):
+ right_block[stream_idx].fill_diagonal_(0, wrap=False)
+ left_block[stream_idx].triu_(-stream_idx + 1)
+
+ left_block[:, :, 0] = 0
+ return torch.cat([left_block, right_block], dim=2)
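+
+
+# Editor's illustrative sketch: for `ngram=2` and `sequence_length=3` the bias
+# has shape (ngram, sequence_length, 2 * sequence_length); the left block gates
+# attention to the main stream and the right block exposes only the diagonal,
+# so each predict position may attend to its own main-stream slot:
+#
+#     bias = ngram_attention_bias(3, 2, torch.device("cpu"), torch.float32)
+#     assert bias.shape == (2, 3, 6)
+#     assert (bias[0].diagonal(offset=3) == 0).all()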
+
+
+def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
+ """
+ This function computes individual parts of the relative position buckets. For more detail, see paper.
+ """
+ inv_relative_positions = -relative_positions
+ rel_positions_bucket = 0
+
+ if is_bidirectional:
+ num_buckets = num_buckets // 2
+ rel_positions_bucket = (
+ rel_positions_bucket
+ + torch.lt(inv_relative_positions, torch.zeros_like(inv_relative_positions)).int() * num_buckets
+ )
+ inv_relative_positions = torch.abs(inv_relative_positions)
+ else:
+ inv_relative_positions = torch.max(inv_relative_positions, torch.zeros_like(inv_relative_positions))
+
+ max_exact = num_buckets // 2
+ is_small = torch.lt(inv_relative_positions, max_exact)
+ val_if_large = max_exact + torch.log(inv_relative_positions.float() / max_exact) / math.log(
+ max_distance / max_exact
+ ) * (num_buckets - max_exact)
+ val_if_large = torch.min(val_if_large, torch.ones_like(val_if_large) * (num_buckets - 1)).int()
+ rel_positions_bucket = rel_positions_bucket + torch.where(is_small, inv_relative_positions.int(), val_if_large)
+ return rel_positions_bucket
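+
+
+# Editor's illustrative sketch: the bucketing follows the T5-style scheme the
+# docstring's paper reference describes -- small distances map to exact buckets,
+# larger ones are binned logarithmically up to `max_distance`:
+#
+#     rel = torch.tensor([[0, -1, -5, -50]])  # keys 0/1/5/50 steps in the past
+#     buckets = compute_relative_buckets(32, 128, rel, is_bidirectional=False)
+#     # exact buckets for the small distances, log-spaced binning for 50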
+
+
+def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
+ """
+ This function computes both main and predict relative position buckets. For more detail, see paper.
+ """
+ # main stream
+ main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1)
+ main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1)
+
+ # predicting stream
+ predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1)
+ predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1)
+ predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1)
+
+ # get both position buckets
+ main_relative_position_buckets = compute_relative_buckets(
+ num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False
+ )
+ predict_relative_position_buckets = compute_relative_buckets(
+ num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False
+ )
+ return main_relative_position_buckets, predict_relative_position_buckets
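+
+
+# Editor's illustrative sketch: the main stream relates every decoder position
+# to every other, while the predict stream additionally sees each position one
+# step back, hence the doubled key length:
+#
+#     position_ids = torch.arange(1, 5).unsqueeze(0)  # (1, 4)
+#     main, predict = compute_all_stream_relative_buckets(32, 128, position_ids)
+#     assert main.shape == (1, 4, 4) and predict.shape == (1, 4, 8)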
+
+
+@dataclass
+class XLMProphetNetSeq2SeqLMOutput(ModelOutput):
+ """
+ Base class for sequence-to-sequence language models outputs.
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
+ Prediction scores of the main stream language modeling head (scores for each vocabulary token before
+ SoftMax).
+ logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+ Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
+ SoftMax).
+ past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+ decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+ outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+ Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+ weighted average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, encoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, encoder_sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
+ softmax, used to compute the weighted average in the self-attention heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: Optional[torch.FloatTensor] = None
+ logits_ngram: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[Cache] = None
+ decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ decoder_ngram_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+ decoder_ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+ @property
+ def decoder_cross_attentions(self):
+ warnings.warn(
+ "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
+ " instead.",
+ FutureWarning,
+ )
+ return self.cross_attentions
+
+
+@dataclass
+class XLMProphetNetSeq2SeqModelOutput(ModelOutput):
+ """
+    Base class for model encoder's outputs that also contains pre-computed hidden states that can speed up sequential
+    decoding.
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
+ Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
+
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`, *optional*):
+ Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
+ past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+ used (see `past_key_values` input) to speed up sequential decoding.
+ decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+ decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+ outputs.
+ decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, encoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the cross-attention heads.
+ encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder of the model.
+ encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, encoder_sequence_length, hidden_size)`.
+
+ Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+ encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ encoder_sequence_length, encoder_sequence_length)`.
+
+ Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ """
+
+ last_hidden_state: torch.FloatTensor
+ last_hidden_state_ngram: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[Cache] = None
+ decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ decoder_ngram_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+ decoder_ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+ encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+ encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+ @property
+ def decoder_cross_attentions(self):
+ warnings.warn(
+ "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
+ " instead.",
+ FutureWarning,
+ )
+ return self.cross_attentions
+
+
+@dataclass
+class XLMProphetNetDecoderModelOutput(ModelOutput):
+ """
+ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+ Args:
+ last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
+ Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
+
+ If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+ hidden_size)` is output.
+        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`, *optional*):
+ Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
+ past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+ used (see `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+ outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, encoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the cross-attention heads.
+ """
+
+ last_hidden_state: torch.FloatTensor
+ last_hidden_state_ngram: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[Cache] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ hidden_states_ngram: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+ ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class XLMProphetNetDecoderLMOutput(ModelOutput):
+ """
+ Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+ Args:
+ loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+ Language modeling loss.
+ logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
+ Prediction scores of the main stream language modeling head (scores for each vocabulary token before
+ SoftMax).
+ logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+ Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
+ SoftMax).
+ past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`.
+
+ Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+ used (see `past_key_values` input) to speed up sequential decoding.
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        hidden_states_ngram (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+ shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+ Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+ outputs.
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+ Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+ self-attention heads.
+ ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+ decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+ cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, encoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the cross-attention heads.
+ """
+
+ loss: Optional[torch.FloatTensor] = None
+ logits: Optional[torch.FloatTensor] = None
+ logits_ngram: Optional[torch.FloatTensor] = None
+ past_key_values: Optional[Cache] = None
+ hidden_states: Optional[tuple[torch.FloatTensor]] = None
+ hidden_states_ngram: Optional[tuple[torch.FloatTensor]] = None
+ attentions: Optional[tuple[torch.FloatTensor]] = None
+ ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+ cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class XLMProphetNetPreTrainedModel(PreTrainedModel):
+ config: XLMProphetNetConfig
+ base_model_prefix = "prophetnet"
+ supports_gradient_checkpointing = True
+
+ def _init_weights(self, module):
+ if isinstance(module, nn.Linear):
+ module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+ if module.bias is not None:
+ module.bias.data.zero_()
+ elif isinstance(module, nn.Embedding):
+ module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+ if module.padding_idx is not None:
+ module.weight.data[module.padding_idx].zero_()
+
+ def _shift_right(self, input_ids):
+ decoder_start_token_id = self.config.decoder_start_token_id
+ pad_token_id = self.config.pad_token_id
+
+ assert decoder_start_token_id is not None, (
+ "self.model.config.decoder_start_token_id has to be defined. In XLMProphetNet it is usually set to the"
+ " pad_token_id. See XLMProphetNet docs for more information"
+ )
+
+ # shift inputs to the right
+ shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+ shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+ shifted_input_ids[..., 0] = decoder_start_token_id
+
+ assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+ # replace possible -100 values in labels by `pad_token_id`
+ shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only non-negative values"
+
+ return shifted_input_ids
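+
+    # Editor's illustrative sketch: `_shift_right` prepends `decoder_start_token_id`,
+    # drops the last label, and replaces any -100 ignore indices with `pad_token_id`.
+    # Assuming both special ids are 0:
+    #
+    #     labels            = [[5, -100, 7, 8]]
+    #     shifted_input_ids = [[0,    5, 0, 7]]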
+
+
+class XLMProphetNetPositionalEmbeddings(nn.Embedding):
+ """
+ This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
+ based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
+ the forward function.
+ """
+
+ def __init__(self, config: XLMProphetNetConfig) -> None:
+ self.max_length = config.max_position_embeddings
+ super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
+
+ def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
+ assert (position_ids is None) or (self.padding_idx is None), (
+ "If position_ids is pre-computed then padding_idx should not be set."
+ )
+
+ if position_ids is None:
+ if past_key_values is not None:
+ # position_ids is the same for every token when decoding a single step
+ # Without the int() cast, it doesn't work in some cases when exporting to ONNX
+ prev_num_input_ids = past_key_values.get_seq_length()
+ num_input_ids = inputs_shape[1] + prev_num_input_ids
+ position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * (
+ int(self.padding_idx + num_input_ids)
+ )
+ else:
+ if attention_mask is None:
+ attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)
+
+ # retrieve position_ids from input_ids / attention_mask
+ position_ids = (
+ torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
+ ).long() + self.padding_idx
+
+        # make sure position_ids are not bigger than max_length
+ position_ids = position_ids.clamp(0, self.max_length - 1)
+
+ return super().forward(position_ids), position_ids
+
+ def _forward(self, position_ids):
+ return super().forward(position_ids)
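+
+
+# Editor's illustrative sketch: outside of incremental decoding, position ids
+# are derived from the attention mask by a cumulative sum offset by the padding
+# index, so padded slots all map back to `padding_idx`:
+#
+#     attention_mask = torch.tensor([[1, 1, 1, 0]])
+#     padding_idx = 1
+#     (torch.cumsum(attention_mask, dim=1) * attention_mask) + padding_idx
+#     # -> tensor([[2, 3, 4, 1]])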
+
+
+class XLMProphetNetAttention(nn.Module):
+ """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+ def __init__(
+ self,
+ config: XLMProphetNetConfig,
+ num_attn_heads: int,
+ ):
+ super().__init__()
+ hidden_size = config.hidden_size
+
+ self.attention_dropout = config.attention_dropout
+ self.dropout = config.dropout
+ self.num_attn_heads = num_attn_heads
+ self.head_dim = hidden_size // num_attn_heads
+
+ assert self.head_dim * num_attn_heads == hidden_size, (
+ "`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and"
+ " `config.num_decoder_attention_heads`"
+ )
+
+ self.key_proj = nn.Linear(hidden_size, hidden_size)
+ self.value_proj = nn.Linear(hidden_size, hidden_size)
+ self.query_proj = nn.Linear(hidden_size, hidden_size)
+
+ self.out_proj = nn.Linear(hidden_size, hidden_size)
+
+ def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+ return tensor.view(bsz, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states,
+ key_value_states: Optional[Tensor] = None,
+ attention_mask: Optional[Tensor] = None,
+ layer_head_mask: Optional[Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ output_attentions: bool = False,
+    ) -> tuple[Tensor, Optional[Tensor], Optional[tuple[Tensor, Tensor]]]:
+ batch_size, tgt_len, hidden_size = hidden_states.size()
+
+ # if key_value_states are provided this layer is used as a cross-attention layer
+ # for the decoder
+ is_cross_attention = key_value_states is not None
+ assert list(hidden_states.size()) == [
+ batch_size,
+ tgt_len,
+ hidden_size,
+ ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}"
+
+ # previous time steps are cached - no need to recompute key and value if they are static
+ query_states = self.query_proj(hidden_states) / (self.head_dim**0.5)
+
+ if is_cross_attention and past_key_values is not None:
+ # reuse k,v, cross_attentions
+ key_states = past_key_values[0]
+ value_states = past_key_values[1]
+ elif is_cross_attention:
+ # cross_attentions
+ key_states = self._shape(self.key_proj(key_value_states), -1, batch_size)
+ value_states = self._shape(self.value_proj(key_value_states), -1, batch_size)
+ else:
+ # self_attention
+ key_states = self._shape(self.key_proj(hidden_states), -1, batch_size)
+ value_states = self._shape(self.value_proj(hidden_states), -1, batch_size)
+
+ if is_cross_attention:
+ # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+ # Further calls to cross_attention layer can then reuse all cross-attention
+ # key/value_states (first "if" case)
+ # if encoder bi-directional self-attention `past_key_values` is always `None`
+ past_key_values = (key_states, value_states)
+
+ # project states into the correct shape
+ proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim)
+ query_states = self._shape(query_states, tgt_len, batch_size).view(*proj_shape)
+ key_states = key_states.view(*proj_shape)
+ value_states = value_states.view(*proj_shape)
+ src_len = key_states.size(2)
+ attn_weights = torch.einsum("bsij,bsjk->bsik", query_states, key_states.transpose(2, 3))
+ expected_shape = (batch_size, self.num_attn_heads, tgt_len, src_len)
+ if attn_weights.size() != expected_shape:
+ raise ValueError(f"Attention weights should have size {expected_shape}, but is {attn_weights.size()}")
+
+ # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
+ if attention_mask is not None and attention_mask.dim() == 0:
+ attention_mask = None
+
+ expected_shape = (batch_size, self.num_attn_heads, 1, src_len)
+ if attention_mask is not None and attention_mask.size() != expected_shape:
+ raise ValueError(f"Attention mask should have size {expected_shape}, but is {attention_mask.size()}")
+ if attention_mask is not None: # don't attend to padding symbols
+ attn_weights = attn_weights + attention_mask
+ if output_attentions:
+ attn_weights_reshaped = attn_weights
+ else:
+ attn_weights_reshaped = None
+
+ attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+ if layer_head_mask is not None:
+ assert layer_head_mask.size() == (self.num_attn_heads,), (
+ f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
+ batch_size, self.num_attn_heads, tgt_len, src_len
+ )
+
+            # apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model
+            if attn_weights_reshaped is not None:
+                attn_weights_reshaped = layer_head_mask.view(1, -1, 1, 1) * attn_weights_reshaped
+
+ attn_probs = nn.functional.dropout(
+ attn_weights,
+ p=self.attention_dropout,
+ training=self.training,
+ )
+ attn_output = torch.einsum("bsij,bsjk->bsik", attn_probs, value_states)
+ expected_shape = (batch_size, self.num_attn_heads, tgt_len, self.head_dim)
+ if attn_output.size() != expected_shape:
+ raise ValueError(f"`attn_output` should have shape {expected_shape}, but is of shape {attn_output.size()}")
+
+ attn_output = attn_output.transpose(1, 2).reshape(batch_size, tgt_len, hidden_size)
+ attn_output = self.out_proj(attn_output)
+
+ attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+ return attn_output, attn_weights_reshaped, past_key_values
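+
+    # Editor's illustrative sketch: the einsums above are plain batched matmuls
+    # over the (batch, head) axes; "bsij,bsjk->bsik" is `a @ b` for 4-D tensors:
+    #
+    #     a = torch.randn(2, 4, 5, 8)
+    #     b = torch.randn(2, 4, 8, 5)
+    #     assert torch.allclose(torch.einsum("bsij,bsjk->bsik", a, b), a @ b)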
+
+
+class XLMProphetNetFeedForward(nn.Module):
+ """
+ This is the residual two feed-forward layer block based on the original Transformer implementation.
+ """
+
+ def __init__(self, config: XLMProphetNetConfig, ffn_dim: int):
+ super().__init__()
+ self.activation_fn = ACT2FN[config.activation_function]
+ self.intermediate = nn.Linear(config.hidden_size, ffn_dim)
+ self.output = nn.Linear(ffn_dim, config.hidden_size)
+ self.activation_dropout = config.activation_dropout
+ self.dropout = config.dropout
+
+ def forward(self, hidden_states):
+ hidden_states = self.intermediate(hidden_states)
+ hidden_states = self.activation_fn(hidden_states)
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+ hidden_states = self.output(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+ return hidden_states
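+
+    # Editor's illustrative sketch: this is the standard position-wise MLP,
+    # `output(dropout(act(intermediate(x))))` plus a final dropout; the residual
+    # addition and LayerNorm are applied by the calling layer, not here.
+    # Assuming a `config` instance:
+    #
+    #     ffn = XLMProphetNetFeedForward(config, config.encoder_ffn_dim)
+    #     out = ffn(torch.randn(2, 7, config.hidden_size))  # same shape in/out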
+
+
+class XLMProphetNetNgramSelfAttention(nn.Module):
+ def __init__(self, config: XLMProphetNetConfig):
+ super().__init__()
+ self.hidden_size = config.hidden_size
+
+ self.num_buckets = config.num_buckets
+ self.relative_max_distance = config.relative_max_distance
+ self.num_attn_heads = config.num_decoder_attention_heads
+ self.dropout = config.dropout
+ self.attention_dropout = config.attention_dropout
+ self.head_dim = config.hidden_size // self.num_attn_heads
+ self.ngram = config.ngram
+
+ assert self.head_dim * self.num_attn_heads == config.hidden_size, (
+ "config.hidden_size must be divisible by num_attn_heads"
+ )
+ # key, value, query projection
+ self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
+ self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+ # out projection
+ self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+ # rel position embeddings
+ self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)
+
+ # for onnx runtime
+ self.onnx_trace = False
+
+ def _shape(self, tensor, seq_len, batch_size):
+ return tensor.view(batch_size, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
+
+ def prepare_for_onnx_export_(self):
+ self.onnx_trace = True
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states,
+ past_key_values: Optional[Cache] = None,
+ attention_mask=None,
+ layer_head_mask=None,
+ extended_predict_attention_mask=None,
+ main_relative_position_buckets=None,
+ predict_relative_position_buckets=None,
+ position_ids=None,
+ ):
+ batch_size, ngram_sequence_length, hidden_size = hidden_states.size()
+ assert list(hidden_states.size()) == [batch_size, ngram_sequence_length, hidden_size], (
+ f"`hidden_states` should be of shape {batch_size, ngram_sequence_length, hidden_size}, but is of shape"
+ f" {hidden_states.shape}"
+ )
+
+ # project
+ query_states = self.query_proj(hidden_states)
+ key_states = self.key_proj(hidden_states)
+ value_states = self.value_proj(hidden_states)
+
+ # normalize
+ query_states = query_states / (self.head_dim**0.5)
+
+ # reshape
+ query_states = self._shape(query_states, ngram_sequence_length, batch_size)
+ key_states = self._shape(key_states, -1, batch_size)
+ value_states = self._shape(value_states, -1, batch_size)
+ proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim)
+
+ query_states = query_states.view(*proj_shape)
+ key_states = key_states.view(*proj_shape)
+ value_states = value_states.view(*proj_shape)
+
+ # chunk into main stream and predict stream
+ hidden_states_list = hidden_states.chunk(1 + self.ngram, dim=1)
+ query_states_list = query_states.chunk(1 + self.ngram, dim=2)
+ key_states_list = key_states.chunk(1 + self.ngram, dim=2)
+ value_states_list = value_states.chunk(1 + self.ngram, dim=2)
+
+ main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:]
+ main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:]
+ main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:]
+ main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:]
+
+ # saved states are stored with shape (batch_size, num_attn_heads, seq_len, head_dim)
+ if past_key_values is not None:
+ prev_main_key_states = past_key_values[0]
+ main_key_states = torch.cat((prev_main_key_states, main_key_states), dim=2)
+ prev_main_value_states = past_key_values[1]
+ main_value_states = torch.cat((prev_main_value_states, main_value_states), dim=2)
+
+ # Update cache
+ past_key_values = (main_key_states, main_value_states)
+
+ # get seq_length of main stream only
+ sequence_length = ngram_sequence_length // (1 + self.ngram)
+
+ # MAIN-STREAM
+ # main attn weights
+        # [batch_size, number_heads, sequence_length, head_dimension]
+        # x [batch_size, number_heads, head_dimension, sequence_length]
+ # -> [batch_size, number_heads, sequence_length, sequence_length]
+ main_attn_weights = torch.einsum("bntc,bncs->bnts", main_query_states, main_key_states.transpose(2, 3))
+
+ # retrieve relative position embeddings for each layer -> see paper for more details
+ main_relative_pos_embeddings = self.get_main_relative_pos_embeddings(
+ main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets
+ )
+
+ main_attn_weights = main_attn_weights + main_relative_pos_embeddings
+
+ if attention_mask is not None:
+ main_attn_weights = main_attn_weights + attention_mask
+
+ main_attn_probs = softmax(
+ main_attn_weights,
+ dim=-1,
+ onnx_trace=self.onnx_trace,
+ ).type_as(main_attn_weights)
+
+ if layer_head_mask is not None:
+ assert layer_head_mask.size() == (self.num_attn_heads,), (
+ f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ main_attn_probs = layer_head_mask.view(1, -1, 1, 1) * main_attn_probs.view(
+ batch_size, self.num_attn_heads, -1, sequence_length
+ )
+
+ main_attn_probs = nn.functional.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)
+ # project to attn_output
+ # [batch_size, number_heads, sequence_length, sequence_length]
+        # x [batch_size, number_heads, sequence_length, head_dimension]
+        # -> [batch_size, number_heads, sequence_length, head_dimension]
+ main_attn_output = torch.einsum("bntc,bncs->bnts", main_attn_probs, main_value_states)
+ # reshape so that num_heads dim is merged into last `head_dim` axis
+ main_attn_output = main_attn_output.transpose(1, 2).reshape(batch_size, 1, sequence_length, hidden_size)
+ main_attn_output = self.out_proj(main_attn_output)
+
+ # PREDICT-STREAM
+        # [batch_size, ngram, number_heads, sequence_length, head_dimension]
+ predict_query_states = torch.stack(predict_query_states_list, 1).view(
+ batch_size, self.ngram, self.num_attn_heads, sequence_length, self.head_dim
+ )
+
+        # [batch_size, ngram, number_heads, 2*sequence_length, head_dimension]
+ predict_key_states = torch.stack([torch.cat([main_key_states, key], 2) for key in predict_key_states_list], 1)
+
+ # [batch_size, sequence_length, ngram, hidden_size]
+ predict_hidden_states = torch.stack(hidden_states_predict_list, dim=2)
+
+        # [batch_size, number_heads, ngram, 2*sequence_length, head_dimension]
+ predict_value_states = torch.cat(
+ [torch.cat([main_value_states, v_p], 2).unsqueeze(2) for v_p in predict_value_states_list], 2
+ )
+
+        # [batch_size, ngram, number_heads, sequence_length, head_dimension]
+        # x [batch_size, ngram, number_heads, 2*sequence_length, head_dimension]
+ # -> [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+ predict_attn_weights = torch.einsum("bnhtc,bnhsc->bnhts", (predict_query_states, predict_key_states))
+
+ # retrieve relative position embeddings for each layer -> see paper for more details
+ # [batch_size, ngram, number_heads, sequence_length, predict_relative_pos_embeddings]
+ predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings(
+ predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets
+ )
+
+ # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+ predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings
+
+ if extended_predict_attention_mask is not None:
+ # Permuting Predict attention mask to [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+ extended_predict_attention_mask = extended_predict_attention_mask.permute(0, 2, 1, 3, 4)
+ extended_predict_attention_mask = extended_predict_attention_mask.to(predict_attn_weights.dtype)
+ predict_attn_weights = predict_attn_weights + extended_predict_attention_mask
+
+ predict_attn_probs = softmax(
+ predict_attn_weights,
+ dim=-1,
+ onnx_trace=self.onnx_trace,
+ ).type_as(predict_attn_weights)
+
+ if layer_head_mask is not None:
+ assert layer_head_mask.size() == (self.num_attn_heads,), (
+ f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
+ f" {layer_head_mask.size()}"
+ )
+ predict_attn_probs = layer_head_mask.view(1, 1, -1, 1, 1) * predict_attn_probs
+
+ predict_attn_probs = nn.functional.dropout(
+ predict_attn_probs, p=self.attention_dropout, training=self.training
+ )
+ # project to attention output
+ # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+        # x [batch_size, ngram, number_heads, 2*sequence_length, head_dimension]
+        # -> [batch_size, ngram, number_heads, sequence_length, head_dimension]
+ predict_attn_output = torch.einsum(
+ "bnhts,bnhsc->bnhtc", (predict_attn_probs, predict_value_states.transpose(1, 2))
+ )
+
+ # reshape so that num_heads dim is merged into last `head_dim` axis
+        # [batch_size, ngram, number_heads, sequence_length, head_dimension] -> [batch_size, ngram, sequence_length, hidden_size]
+ predict_attn_output = predict_attn_output.transpose(2, 3)
+ predict_attn_output = predict_attn_output.reshape(batch_size, self.ngram, sequence_length, hidden_size)
+ predict_attn_output = self.out_proj(predict_attn_output)
+
+ # concat to single attn output
+ # [batch_size, (1+ngram)*sequence_length, hidden_size]
+ attn_output = torch.cat([main_attn_output, predict_attn_output], 1).view(batch_size, -1, hidden_size)
+ # reshape into better form for `config.output_attentions`
+ main_attn_probs = main_attn_probs.view(batch_size, self.num_attn_heads, sequence_length, -1)
+
+ attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+
+ return attn_output, main_attn_probs, predict_attn_probs, past_key_values
+
+ def get_main_relative_pos_embeddings(
+ self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
+ ):
+ # input hidden_states [batch_size, sequence_length, hidden_size]
+ # input attn_weights [batch_size, num_heads, sequence_length, sequence_length]
+ # input position_ids [batch_size, sequence_length] or [1,1]
+ batch_size, num_attn_heads, tgt_len, src_len = attn_weights.shape
+ attn_weights = attn_weights.view(batch_size, num_attn_heads, tgt_len, src_len)
+ if main_relative_position_buckets is None:
+ batch_size, sequence_length = hidden_states.shape[:2]
+ relative_positions = (
+ torch.arange(1, attn_weights.shape[-1] + 1)
+ .unsqueeze(0)
+ .unsqueeze(0)
+ .repeat(batch_size, sequence_length, 1)
+ .to(position_ids.device)
+ )
+ # [batch_size, sequence_length, sequence_length+1]
+ relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
+ main_relative_position_buckets = compute_relative_buckets(
+ self.num_buckets, self.relative_max_distance, relative_positions, False
+ )
+
+ # [batch_size, sequence_length, num_buckets * num_heads]
+ rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
+ rel_pos_embeddings = rel_pos_embeddings.view(
+ rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
+ )
+ rel_pos_embeddings = rel_pos_embeddings.permute(0, 3, 1, 2)
+ # [batch_size, num_heads, sequence_length, num_buckets]
+ rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:3] + (-1,))
+
+ main_relative_position_buckets = main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
+ # [batch_size * num_heads * sequence_length, sequence_length]
+ main_relative_position_buckets = main_relative_position_buckets.view(
+ -1, main_relative_position_buckets.shape[-1]
+ )
+ main_relative_position_buckets = main_relative_position_buckets.long()
+ # [batch_size * num_heads * sequence_length, sequence_length]
+ rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))
+
+ main_relative_pos_embeddings = torch.gather(rel_pos_embeddings, dim=1, index=main_relative_position_buckets)
+ main_relative_pos_embeddings = main_relative_pos_embeddings.view(batch_size, num_attn_heads, tgt_len, -1)
+ return main_relative_pos_embeddings
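+
+    # Editor's illustrative sketch: the gather above picks, for each (head,
+    # query, key) triple, the embedding of that pair's relative-position bucket:
+    #
+    #     emb = torch.tensor([[10.0, 20.0, 30.0]])  # one row, num_buckets = 3
+    #     idx = torch.tensor([[2, 0]])              # bucket per key position
+    #     torch.gather(emb, dim=1, index=idx)       # -> tensor([[30., 10.]])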
+
+ def get_predict_relative_pos_embeddings(
+ self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
+ ):
+ # input hidden_states [batch_size, sequence_length, ngram, hidden_size]
+ # input attn_weights [batch_size, ngram, num_heads, sequence_length, 2*sequence_length]
+ # input position_ids [batch_size, sequence_length] or [1,1]
+ # input predict_relative_position_buckets [batch_size, sequence_length, 2*sequence_length] or None
+ batch_size, sequence_length = hidden_states.shape[0:2]
+
+ if predict_relative_position_buckets is None:
+ key_sequence_length = attn_weights.shape[-1]
+ assert position_ids[0][0] == key_sequence_length - 1, (
+ "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
+ )
+ relative_positions = (
+ torch.arange(0, key_sequence_length)
+ .unsqueeze(0)
+ .unsqueeze(0)
+ .repeat(batch_size, sequence_length, 1)
+ .to(position_ids.device)
+ )
+
+ relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
+ predict_relative_position_buckets = compute_relative_buckets(
+ self.num_buckets, self.relative_max_distance, relative_positions, False
+ )
+
+ # [batch_size, ngram, sequence_length, hidden_size]
+ hidden_states = hidden_states.transpose(1, 2)
+ rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
+
+ # [batch_size, ngram, sequence_length, num_buckets, num_heads]
+ rel_pos_embeddings = rel_pos_embeddings.view(
+ hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
+ )
+ rel_pos_embeddings = rel_pos_embeddings.permute(0, 2, 1, 4, 3)
+ # [batch_size * ngram * sequence_length * num_heads, num_buckets]
+ rel_pos_embeddings = rel_pos_embeddings.reshape(-1, self.num_buckets)
+ # [ngram, batch_size, num_heads * sequence_length, -1]
+ predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0)
+ predict_relative_position_buckets = predict_relative_position_buckets.repeat(
+ self.ngram, 1, self.num_attn_heads, 1
+ )
+ # [ngram * batch_size * num_heads * sequence_length, -1]
+ predict_relative_position_buckets = predict_relative_position_buckets.view(
+ -1, predict_relative_position_buckets.size(-1)
+ ).long()
+
+ predict_relative_pos_embeddings = torch.gather(
+ rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
+ )
+
+        # [batch_size, ngram, num_heads, sequence_length, -1]
+ predict_relative_pos_embeddings = predict_relative_pos_embeddings.view(
+ batch_size, self.ngram, self.num_attn_heads, sequence_length, -1
+ )
+
+ return predict_relative_pos_embeddings
+
+
+class XLMProphetNetEncoderLayer(GradientCheckpointingLayer):
+ """
+    Encoder block for XLMProphetNet
+ """
+
+ def __init__(self, config: XLMProphetNetConfig):
+ super().__init__()
+ # 1st residual block
+ self.self_attn = XLMProphetNetAttention(config, config.num_encoder_attention_heads)
+ self.self_attn_layer_norm = LayerNorm(config.hidden_size)
+
+ # 2nd residual block
+ self.feed_forward = XLMProphetNetFeedForward(config, config.encoder_ffn_dim)
+ self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
+
+ def forward(
+ self,
+ hidden_states,
+ attention_mask,
+ layer_head_mask,
+ output_attentions: bool = False,
+ ):
+ # 1st residual block
+ attention_output, attn_weights, _ = self.self_attn(
+ hidden_states=hidden_states,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ output_attentions=output_attentions,
+ )
+ hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)
+
+ # 2nd residual block
+ feed_forward_output = self.feed_forward(hidden_states)
+ hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (attn_weights,)
+
+ return outputs
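+
+    # Editor's note: this layer uses the post-LayerNorm ordering of the original
+    # Transformer -- each sub-layer result is added to its input *before* the
+    # norm, i.e. `x = LayerNorm(sublayer(x) + x)`, rather than the pre-LN
+    # variant `x = x + sublayer(LayerNorm(x))`.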
+
+
+class XLMProphetNetDecoderLayer(GradientCheckpointingLayer):
+ """
+    Decoder block for XLMProphetNet
+ """
+
+ def __init__(self, config: XLMProphetNetConfig):
+ super().__init__()
+ # 1st residual block
+ self.self_attn = XLMProphetNetNgramSelfAttention(config)
+ self.self_attn_layer_norm = LayerNorm(config.hidden_size)
+
+ # 2nd residual block
+ if config.add_cross_attention:
+ self.cross_attn = XLMProphetNetAttention(config, config.num_decoder_attention_heads)
+ self.cross_attn_layer_norm = LayerNorm(config.hidden_size)
+
+ # 3rd residual block
+ self.feed_forward = XLMProphetNetFeedForward(config, config.decoder_ffn_dim)
+ self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
+
+ @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+ def forward(
+ self,
+ hidden_states,
+ attention_mask=None,
+ encoder_hidden_states=None,
+ encoder_attn_mask=None,
+ layer_head_mask=None,
+ cross_attn_layer_head_mask=None,
+ extended_predict_attention_mask=None,
+ main_relative_position_buckets=None,
+ predict_relative_position_buckets=None,
+ position_ids=None,
+ past_key_values=None,
+ use_cache: bool = True,
+ output_attentions: bool = False,
+ ):
+ # 1st residual block
+ # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+ self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+ ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn(
+ hidden_states=hidden_states,
+ past_key_values=self_attn_past_key_value,
+ attention_mask=attention_mask,
+ layer_head_mask=layer_head_mask,
+ extended_predict_attention_mask=extended_predict_attention_mask,
+ main_relative_position_buckets=main_relative_position_buckets,
+ predict_relative_position_buckets=predict_relative_position_buckets,
+ position_ids=position_ids,
+ )
+ hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)
+
+ # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+ cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+ cross_attn_weights = None
+ if encoder_hidden_states is not None:
+ # 2nd residual block
+ attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn(
+ hidden_states=hidden_states,
+ key_value_states=encoder_hidden_states,
+ attention_mask=encoder_attn_mask,
+ layer_head_mask=cross_attn_layer_head_mask,
+ past_key_values=cross_attn_past_key_value,
+ output_attentions=output_attentions,
+ )
+ hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)
+
+ # add cross-attn to positions 3,4 of present_key_value tuple
+ present_key_value = present_key_value + cross_attn_present_key_value
+
+ # 3rd residual block
+ feed_forward_output = self.feed_forward(hidden_states)
+ hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
+
+ outputs = (hidden_states,)
+
+ if output_attentions:
+ outputs += (self_attn_weights, self_attn_weights_ngram, cross_attn_weights)
+
+ if use_cache:
+ outputs += (present_key_value,)
+
+ return outputs
+
+
+@add_start_docstrings(
+ "The standalone encoder part of the XLMProphetNetModel.",
+ XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
+ r"""
+    word_embeddings (`torch.nn.Embedding` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
+ The word embedding parameters. This can be used to initialize [`XLMProphetNetEncoder`] with pre-defined word
+ embeddings instead of randomly initialized word embeddings.
+ """
+
+ def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
+ super().__init__(config)
+
+ self.word_embeddings = (
+ word_embeddings
+ if word_embeddings is not None
+ else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ )
+ self.position_embeddings = XLMProphetNetPositionalEmbeddings(config)
+ self.embeddings_layer_norm = LayerNorm(config.hidden_size)
+
+ self.layers = nn.ModuleList([XLMProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)])
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.word_embeddings = value
+
+ @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, BaseModelOutput]:
+ r"""
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, XLMProphetNetEncoder
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+    >>> model = XLMProphetNetEncoder.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
+
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None and inputs_embeds is None:
+ raise ValueError("Either input_ids or inputs_embeds has to be passed.")
+ elif input_ids is not None and inputs_embeds is not None:
+ raise ValueError("Make sure to only pass input_ids or inputs_embeds.")
+ elif input_ids is not None and inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ # prepare attention mask
+ if attention_mask is not None:
+ extended_attention_mask = (
+ 1.0 - attention_mask[:, None, None, :].repeat(1, self.config.num_encoder_attention_heads, 1, 1)
+ ) * torch.finfo(self.dtype).min
+ extended_attention_mask = extended_attention_mask.to(inputs_embeds.dtype)
+ else:
+ extended_attention_mask = None
+
+ position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2], inputs_embeds.device)
+
+ hidden_states = inputs_embeds + position_embeddings
+ hidden_states = self.embeddings_layer_norm(hidden_states)
+ hidden_states = nn.functional.dropout(hidden_states, p=self.config.dropout, training=self.training)
+
+ encoder_hidden_states = () if output_hidden_states else None
+ all_attentions = () if output_attentions else None
+
+ # check if head_mask has a correct number of layers specified if desired
+ if head_mask is not None:
+ assert head_mask.size()[0] == (len(self.layers)), (
+ f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+ )
+ for idx, encoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ encoder_hidden_states = encoder_hidden_states + (hidden_states,)
+
+ layer_outputs = encoder_layer(
+ hidden_states,
+ attention_mask=extended_attention_mask,
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if output_attentions:
+ all_attentions = all_attentions + (layer_outputs[1],)
+
+ if output_hidden_states:
+ encoder_hidden_states = encoder_hidden_states + (hidden_states,)
+
+ if not return_dict:
+ return tuple(v for v in [hidden_states, encoder_hidden_states, all_attentions] if v is not None)
+ return BaseModelOutput(
+ last_hidden_state=hidden_states, hidden_states=encoder_hidden_states, attentions=all_attentions
+ )
+
+
+@add_start_docstrings(
+ "The standalone decoder part of the XLMProphetNetModel.",
+ XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):
+ r"""
+    word_embeddings (`torch.nn.Embedding` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
+        The word embedding parameters. This can be used to initialize [`XLMProphetNetDecoder`] with pre-defined word
+ embeddings instead of randomly initialized word embeddings.
+ """
+
+ def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
+ super().__init__(config)
+
+ self.ngram = config.ngram
+ self.num_buckets = config.num_buckets
+ self.relative_max_distance = config.relative_max_distance
+ self.dropout = config.dropout
+ self.max_target_positions = config.max_position_embeddings
+
+ self.word_embeddings = (
+ word_embeddings
+ if word_embeddings is not None
+ else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ )
+ self.position_embeddings = XLMProphetNetPositionalEmbeddings(config)
+
+ self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None)
+ self.layers = nn.ModuleList([XLMProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
+ self.embeddings_layer_norm = LayerNorm(config.hidden_size)
+
+ self.gradient_checkpointing = False
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.word_embeddings = value
+
+ @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=XLMProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, XLMProphetNetDecoderModelOutput]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, XLMProphetNetDecoder
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+ >>> model = XLMProphetNetDecoder.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone", add_cross_attention=False)
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ ```"""
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if input_ids is None and inputs_embeds is None:
+ raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.")
+ elif input_ids is not None and inputs_embeds is not None:
+ raise ValueError("Make sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.")
+ elif input_ids is not None and inputs_embeds is None:
+ inputs_embeds = self.word_embeddings(input_ids)
+
+ batch_size, sequence_length = inputs_embeds.shape[:2]
+
+ main_stream_pos_embed, position_ids = self.position_embeddings(
+ (batch_size, sequence_length),
+ device=inputs_embeds.device,
+ past_key_values=past_key_values,
+ )
+
+ if past_key_values is not None:
+ main_relative_position_buckets, predict_relative_position_buckets = None, None
+ else:
+ (
+ main_relative_position_buckets,
+ predict_relative_position_buckets,
+ ) = self.compute_buffered_relative_buckets(position_ids)
+ predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1)
+
+ # add position embeddings
+ hidden_states = inputs_embeds + main_stream_pos_embed
+
+ ngram_embeddings = self.ngram_embeddings.weight
+
+ # prepare attention mask
+ if past_key_values is not None:
+ assert hidden_states.size(1) == 1, (
+ "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
+ )
+
+ ngram_hidden_states = [
+ (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
+ for ngram in range(self.ngram)
+ ]
+ extended_attention_mask = None
+ extended_predict_attention_mask = None
+ else:
+ ngram_hidden_states = [
+ (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed) for ngram in range(self.ngram)
+ ]
+ extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask)
+ extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask)
+
+ # prepare encoder attention mask
+ if encoder_attention_mask is not None:
+ extended_encoder_attention_mask = (
+ 1.0 - encoder_attention_mask[:, None, None, :].repeat(1, self.config.num_decoder_attention_heads, 1, 1)
+ ) * torch.finfo(self.dtype).min
+ extended_encoder_attention_mask = extended_encoder_attention_mask.to(inputs_embeds.dtype)
+ else:
+ extended_encoder_attention_mask = None
+
+ hidden_states = torch.cat([hidden_states] + ngram_hidden_states, 1)
+
+ if self.embeddings_layer_norm:
+ hidden_states = self.embeddings_layer_norm(hidden_states)
+
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+ # init attentions, hidden_states and cache with empty tuples
+ all_main_stream_hidden_states = () if output_hidden_states else None
+ all_ngram_stream_hidden_states = () if output_hidden_states and self.config.ngram > 0 else None
+
+ all_main_stream_attns = () if output_attentions else None
+ all_ngram_stream_attns = () if output_attentions else None
+ all_cross_attns = () if output_attentions and self.config.add_cross_attention else None
+
+ if self.gradient_checkpointing and self.training:
+ if use_cache:
+ logger.warning_once(
+ "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+ )
+ use_cache = False
+
+ present_key_values = () if use_cache else None
+
+ # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+ for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+ if attn_mask is not None:
+ assert attn_mask.size()[0] == (len(self.layers)), (
+ f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+ f" {attn_mask.size()[0]}."
+ )
+ for idx, decoder_layer in enumerate(self.layers):
+ if output_hidden_states:
+ # grad cannot be kept because tensor is sliced
+ all_main_stream_hidden_states += (hidden_states[:, :sequence_length],)
+ if self.config.ngram > 0:
+ all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],)
+
+ layer_outputs = decoder_layer(
+ hidden_states,
+ attention_mask=extended_attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attn_mask=extended_encoder_attention_mask,
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+ cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+ extended_predict_attention_mask=extended_predict_attention_mask,
+ main_relative_position_buckets=main_relative_position_buckets,
+ predict_relative_position_buckets=predict_relative_position_buckets,
+ position_ids=position_ids,
+ past_key_values=past_key_values[idx] if past_key_values is not None else None,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ )
+
+ hidden_states = layer_outputs[0]
+
+ if use_cache:
+ present_key_values += (layer_outputs[4 if output_attentions else 1],)
+
+ if output_attentions:
+ all_main_stream_attns += (layer_outputs[1],)
+ all_ngram_stream_attns += (layer_outputs[2],)
+
+ if self.config.add_cross_attention:
+ all_cross_attns += (layer_outputs[3],)
+
+ if output_hidden_states:
+ all_main_stream_hidden_states += (hidden_states[:, :sequence_length],)
+ if self.config.ngram > 0:
+ all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],)
+
+ # split last_hidden_state for return
+ last_hidden_state = hidden_states[:, :sequence_length]
+ last_hidden_state_ngram = hidden_states[:, sequence_length:] if self.config.ngram > 0 else None
+
+ if not return_dict:
+ return tuple(
+ v
+ for v in [
+ last_hidden_state,
+ last_hidden_state_ngram,
+ present_key_values,
+ all_main_stream_hidden_states,
+ all_ngram_stream_hidden_states,
+ all_main_stream_attns,
+ all_ngram_stream_attns,
+ all_cross_attns,
+ ]
+ if v is not None
+ )
+ return XLMProphetNetDecoderModelOutput(
+ last_hidden_state=last_hidden_state,
+ last_hidden_state_ngram=last_hidden_state_ngram,
+ past_key_values=present_key_values,
+ hidden_states=all_main_stream_hidden_states,
+ hidden_states_ngram=all_ngram_stream_hidden_states,
+ attentions=all_main_stream_attns,
+ ngram_attentions=all_ngram_stream_attns,
+ cross_attentions=all_cross_attns,
+ )
+
+ def compute_buffered_relative_buckets(self, position_ids):
+ batch_size, sequence_length = position_ids.shape
+
+ position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(1, 1)
+ main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets(
+ self.num_buckets, self.relative_max_distance, position_ids
+ )
+
+ # buffer relative buckets
+ main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1)
+ predict_relative_buckets = torch.cat(
+ [
+ predict_relative_buckets[:, :sequence_length, :sequence_length],
+ predict_relative_buckets[
+ :, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
+ ],
+ ],
+ 2,
+ ).repeat(batch_size, 1, 1)
+
+ return main_relative_buckets, predict_relative_buckets
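+
+ # Shape sketch (illustrative note, not original code): for position_ids of shape
+ # (batch_size, sequence_length), the buffered buckets returned above have shapes
+ #   main_relative_buckets:    (batch_size, sequence_length, sequence_length)
+ #   predict_relative_buckets: (batch_size, sequence_length, 2 * sequence_length)
+ # since the predict stream attends to the main positions and the shifted predict
+ # positions, concatenated along the last dimension.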
+
+ def prepare_attention_mask(self, hidden_states, attention_mask):
+ batch_size, seq_length = hidden_states.shape[:2]
+
+ # get causal mask
+ causal_mask = torch.full(
+ (seq_length, seq_length),
+ torch.finfo(hidden_states.dtype).min,
+ dtype=hidden_states.dtype,
+ device=hidden_states.device,
+ )
+ causal_mask = torch.triu(causal_mask, 1)
+
+ extended_causal_mask = causal_mask[:seq_length, :seq_length][None, None, :, :].expand(
+ (batch_size, self.config.num_decoder_attention_heads) + causal_mask.shape
+ )
+
+ # add usual attention mask
+ if attention_mask is not None:
+ extended_attention_mask = (1.0 - attention_mask[:, None, None, :]) * torch.finfo(self.dtype).min
+ extended_attention_mask = extended_causal_mask + extended_attention_mask
+ else:
+ extended_attention_mask = extended_causal_mask
+ return extended_attention_mask.to(hidden_states.dtype)
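+
+ # Shape sketch (illustrative note, not original code): with seq_length = 3 the
+ # causal component built above is
+ #   [[0, m, m],
+ #    [0, 0, m],
+ #    [0, 0, 0]]
+ # where m = torch.finfo(hidden_states.dtype).min, expanded to
+ # (batch_size, num_decoder_attention_heads, 3, 3) before the optional padding
+ # mask is added on top.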
+
+ def prepare_predict_attention_mask(self, hidden_states, attention_mask):
+ batch_size, seq_length = hidden_states.shape[:2]
+
+ # get causal mask
+ predict_causal_mask = ngram_attention_bias(
+ self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype
+ )
+ predict_causal_mask = torch.cat(
+ [
+ predict_causal_mask[:, :seq_length, :seq_length],
+ predict_causal_mask[
+ :, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
+ ],
+ ],
+ dim=-1,
+ )
+ extended_predict_causal_mask = predict_causal_mask[None, None, :, :, :].expand(
+ (batch_size, self.config.num_decoder_attention_heads) + predict_causal_mask.shape
+ )
+
+ # add usual attention mask
+ if attention_mask is not None:
+ extended_attention_mask = (1.0 - attention_mask[:, None, None, None, :]) * torch.finfo(self.dtype).min
+ extended_attention_mask = extended_attention_mask.expand(
+ (batch_size, self.config.num_decoder_attention_heads, self.ngram, seq_length, seq_length)
+ )
+ # predicted stream attention_mask should always be 0
+ extended_attention_mask = torch.cat(
+ [extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1
+ )
+ extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
+ else:
+ extended_predict_attention_mask = extended_predict_causal_mask
+ return extended_predict_attention_mask.to(hidden_states.dtype)
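+
+ # Shape sketch (illustrative note, not original code): the returned predict-stream
+ # mask has shape (batch_size, num_decoder_attention_heads, ngram, seq_length,
+ # 2 * seq_length); the first seq_length key positions cover the main stream and the
+ # last seq_length positions cover the predict stream, whose padding component is
+ # deliberately kept at 0 (see the zeros_like concatenation above).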
+
+
+@add_start_docstrings(
+ "The bare XLMProphetNet Model outputting raw hidden-states without any specific head on top.",
+ XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetModel(XLMProphetNetPreTrainedModel):
+ _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
+
+ def __init__(self, config: XLMProphetNetConfig):
+ super().__init__(config)
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+
+ encoder_config = copy.deepcopy(config)
+ encoder_config.is_encoder_decoder = False
+ encoder_config.use_cache = False
+ self.encoder = XLMProphetNetEncoder(encoder_config, self.word_embeddings)
+
+ decoder_config = copy.deepcopy(config)
+ decoder_config.is_decoder = True
+ decoder_config.is_encoder_decoder = False
+ self.decoder = XLMProphetNetDecoder(decoder_config, self.word_embeddings)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.word_embeddings = value
+ self.encoder.word_embeddings = self.word_embeddings
+ self.decoder.word_embeddings = self.word_embeddings
+
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.encoder.word_embeddings, self.word_embeddings)
+ self._tie_or_clone_weights(self.decoder.word_embeddings, self.word_embeddings)
+
+ def get_encoder(self):
+ return self.encoder
+
+ @add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=XLMProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.Tensor] = None,
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ decoder_head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ encoder_outputs: Optional[tuple] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ decoder_inputs_embeds: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, XLMProphetNetSeq2SeqModelOutput]:
+ r"""
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, XLMProphetNetModel
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+ >>> model = XLMProphetNetModel.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+
+ >>> input_ids = tokenizer(
+ ... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+ ... ).input_ids # Batch size 1
+ >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+
+ >>> last_hidden_states = outputs.last_hidden_state # main stream hidden states
+ >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram # predict hidden states
+ ```"""
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+ output_hidden_states = (
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+ )
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if encoder_outputs is None:
+ encoder_outputs = self.encoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ # decoder outputs consist of (dec_features, past_key_values, dec_hidden, dec_attn)
+ decoder_outputs = self.decoder(
+ input_ids=decoder_input_ids,
+ attention_mask=decoder_attention_mask,
+ encoder_hidden_states=encoder_outputs[0],
+ encoder_attention_mask=attention_mask,
+ head_mask=decoder_head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=decoder_inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ use_cache=use_cache,
+ return_dict=return_dict,
+ )
+
+ if not return_dict:
+ return decoder_outputs + encoder_outputs
+ return XLMProphetNetSeq2SeqModelOutput(
+ last_hidden_state=decoder_outputs.last_hidden_state,
+ last_hidden_state_ngram=decoder_outputs.last_hidden_state_ngram,
+ past_key_values=decoder_outputs.past_key_values,
+ decoder_hidden_states=decoder_outputs.hidden_states,
+ decoder_ngram_hidden_states=decoder_outputs.hidden_states_ngram,
+ decoder_attentions=decoder_outputs.attentions,
+ decoder_ngram_attentions=decoder_outputs.ngram_attentions,
+ cross_attentions=decoder_outputs.cross_attentions,
+ encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+ encoder_hidden_states=encoder_outputs.hidden_states,
+ encoder_attentions=encoder_outputs.attentions,
+ )
+
+
+@add_start_docstrings(
+ "The XLMProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
+ XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
+ _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
+
+ def __init__(self, config: XLMProphetNetConfig):
+ super().__init__(config)
+ self.prophetnet = XLMProphetNetModel(config)
+ self.padding_idx = config.pad_token_id
+ self.disable_ngram_loss = config.disable_ngram_loss
+
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head)
+
+ def get_input_embeddings(self):
+ return self.prophetnet.word_embeddings
+
+ @add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=XLMProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ decoder_input_ids: Optional[torch.Tensor] = None,
+ decoder_attention_mask: Optional[torch.BoolTensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ decoder_head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ encoder_outputs: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ decoder_inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, XLMProphetNetSeq2SeqLMOutput]:
+ r"""
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the sequence-to-sequence language modeling loss. Indices should be in `[-100, 0, ...,
+ config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
+ labels in `[0, ..., config.vocab_size - 1]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, XLMProphetNetForConditionalGeneration
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+ >>> model = XLMProphetNetForConditionalGeneration.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+
+ >>> input_ids = tokenizer(
+ ... "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+ ... ).input_ids # Batch size 1
+ >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids # Batch size 1
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+
+ >>> logits_next_token = outputs.logits # logits to predict next token as usual
+ >>> logits_ngram_next_tokens = outputs.logits_ngram # logits to predict 2nd, 3rd, ... next tokens
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+ # get decoder inputs from shifting lm labels to the right
+ decoder_input_ids = self._shift_right(labels)
+
+ outputs = self.prophetnet(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ decoder_input_ids=decoder_input_ids,
+ decoder_attention_mask=decoder_attention_mask,
+ head_mask=head_mask,
+ decoder_head_mask=decoder_head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ encoder_outputs=encoder_outputs,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ decoder_inputs_embeds=decoder_inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+ batch_size, sequence_length = (
+ decoder_input_ids.shape if decoder_input_ids is not None else decoder_inputs_embeds.shape[:2]
+ )
+
+ predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
+ predict_logits = self.lm_head(predicting_streams)
+
+ logits = predict_logits[:, 0]
+ logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None
+
+ # To use .view in loss computation, make sure that logits is contiguous.
+ if not logits.is_contiguous():
+ logits = logits.contiguous()
+
+ loss = None
+ if labels is not None:
+ loss = self._compute_loss(predict_logits, labels)
+
+ if not return_dict:
+ all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
+ return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
+ else:
+ return XLMProphetNetSeq2SeqLMOutput(
+ loss=loss,
+ logits=logits,
+ logits_ngram=logits_ngram,
+ past_key_values=outputs.past_key_values,
+ decoder_hidden_states=outputs.decoder_hidden_states,
+ decoder_ngram_hidden_states=outputs.decoder_ngram_hidden_states,
+ decoder_attentions=outputs.decoder_attentions,
+ decoder_ngram_attentions=outputs.decoder_ngram_attentions,
+ cross_attentions=outputs.cross_attentions,
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+ encoder_hidden_states=outputs.encoder_hidden_states,
+ encoder_attentions=outputs.encoder_attentions,
+ )
+
+ def _compute_loss(self, logits, labels, ignore_index=-100):
+ expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
+
+ for i in range(self.config.ngram):
+ if i > 0 and self.disable_ngram_loss:
+ break
+ expend_targets[i, :, :] = labels
+
+ logits = logits.transpose(0, 1).contiguous()
+ lprobs = nn.functional.log_softmax(
+ logits.view(-1, logits.size(-1)),
+ dim=-1,
+ dtype=torch.float32,
+ )
+
+ loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
+
+ if self.config.eps > 0.0:
+ smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+ non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
+ smooth_loss = smooth_loss[non_masked_tokens]
+ smooth_loss = smooth_loss.mean()
+
+ eps_i = self.config.eps / lprobs.size(-1)
+ loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
+
+ return loss
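+
+ # Numeric sketch of the label smoothing above (illustrative note, assuming
+ # eps = 0.1 and vocabulary size V): the final loss is
+ #   (1 - eps) * nll_loss + (eps / V) * sum_over_vocab(-log_softmax)
+ # i.e. a convex combination of the NLL on the gold token and a uniform smoothing
+ # term, with the smoothing term averaged over non-masked target positions only.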
+
+ def prepare_inputs_for_generation(
+ self,
+ decoder_input_ids,
+ past_key_values=None,
+ attention_mask=None,
+ head_mask=None,
+ decoder_head_mask=None,
+ cross_attn_head_mask=None,
+ use_cache=None,
+ encoder_outputs=None,
+ **kwargs,
+ ):
+ assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation."
+
+ if past_key_values:
+ decoder_input_ids = decoder_input_ids[:, -1:]
+ # first step, decoder_cached_states are empty
+ return {
+ "input_ids": None, # encoder_outputs is defined. input_ids not needed
+ "encoder_outputs": encoder_outputs,
+ "past_key_values": past_key_values,
+ "decoder_input_ids": decoder_input_ids,
+ "attention_mask": attention_mask,
+ "head_mask": head_mask,
+ "decoder_head_mask": decoder_head_mask,
+ "cross_attn_head_mask": cross_attn_head_mask,
+ "use_cache": use_cache,
+ }
+
+ def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+ return self._shift_right(labels)
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ # cached cross_attention states don't have to be reordered -> they are always the same
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+ + layer_past[2:],
+ )
+ return reordered_past
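+
+ # Reordering sketch (illustrative note, not original code): with
+ # beam_idx = torch.tensor([2, 0, 1]) each cached self-attention key/value tensor
+ # (layer_past[:2]) is re-ordered along its batch dimension via index_select so
+ # that row i holds the state of the beam producing hypothesis i, while the cached
+ # cross-attention states (layer_past[2:]) are identical across beams and kept as-is.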
+
+ def get_encoder(self):
+ return self.prophetnet.encoder
+
+ def get_decoder(self):
+ return self.prophetnet.decoder
+
+
+@add_start_docstrings(
+ "The standalone decoder part of the XLMProphetNetModel with a lm head on top. The model can be used for causal"
+ " language modeling.",
+ XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
+ _tied_weights_keys = [
+ "prophetnet.word_embeddings.weight",
+ "prophetnet.decoder.word_embeddings.weight",
+ "lm_head.weight",
+ ]
+
+ def __init__(self, config: XLMProphetNetConfig):
+ # set config for CLM
+ config = copy.deepcopy(config)
+ config.is_decoder = True
+ config.is_encoder_decoder = False
+ super().__init__(config)
+ self.prophetnet = XLMProphetNetDecoderWrapper(config)
+
+ self.padding_idx = config.pad_token_id
+ self.disable_ngram_loss = config.disable_ngram_loss
+
+ self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def get_input_embeddings(self):
+ return self.prophetnet.decoder.word_embeddings
+
+ def set_input_embeddings(self, value):
+ self.prophetnet.decoder.word_embeddings = value
+
+ def _tie_weights(self):
+ if self.config.tie_word_embeddings:
+ self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head)
+
+ def set_decoder(self, decoder):
+ self.prophetnet.decoder = decoder
+
+ def get_decoder(self):
+ return self.prophetnet.decoder
+
+ @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+ @replace_return_docstrings(output_type=XLMProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC)
+ def forward(
+ self,
+ input_ids: Optional[torch.Tensor] = None,
+ attention_mask: Optional[torch.Tensor] = None,
+ encoder_hidden_states: Optional[torch.Tensor] = None,
+ encoder_attention_mask: Optional[torch.Tensor] = None,
+ head_mask: Optional[torch.Tensor] = None,
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
+ past_key_values: Optional[Cache] = None,
+ inputs_embeds: Optional[torch.Tensor] = None,
+ labels: Optional[torch.Tensor] = None,
+ use_cache: Optional[bool] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ ) -> Union[tuple, XLMProphetNetDecoderLMOutput]:
+ r"""
+ encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+ Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+ the model is configured as a decoder.
+ encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+ the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+ Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+ Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+ If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+ don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+ `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+ use_cache (`bool`, *optional*):
+ If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+ `past_key_values`).
+
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+ Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+ `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring). Tokens with indices set to `-100` are
+ ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+ Returns:
+
+ Example:
+
+ ```python
+ >>> from transformers import AutoTokenizer, XLMProphetNetForCausalLM
+ >>> import torch
+
+ >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+ >>> model = XLMProphetNetForCausalLM.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+ >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+ >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+ >>> outputs = model(**inputs)
+
+ >>> logits = outputs.logits
+
+ >>> # Model can also be used with EncoderDecoder framework
+ >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
+ >>> import torch
+
+ >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
+ >>> tokenizer_dec = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+ >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+ ... "google-bert/bert-large-uncased", "patrickvonplaten/xprophetnet-large-uncased-standalone"
+ ... )
+
+ >>> ARTICLE = (
+ ... "the us state department said wednesday it had received no "
+ ... "formal word from bolivia that it was expelling the us ambassador there "
+ ... "but said the charges made against him are `` baseless ."
+ ... )
+ >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
+ >>> labels = tokenizer_dec(
+ ... "us rejects charges against its ambassador in bolivia", return_tensors="pt"
+ ... ).input_ids
+ >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])
+
+ >>> loss = outputs.loss
+ ```"""
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+ # decoder outputs consist of (dec_features, past_key_values, dec_hidden, dec_attn)
+ outputs = self.prophetnet.decoder(
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ encoder_hidden_states=encoder_hidden_states,
+ encoder_attention_mask=encoder_attention_mask,
+ head_mask=head_mask,
+ cross_attn_head_mask=cross_attn_head_mask,
+ past_key_values=past_key_values,
+ inputs_embeds=inputs_embeds,
+ use_cache=use_cache,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ )
+
+ batch_size, sequence_length = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2]
+
+ predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
+ predict_logits = self.lm_head(predicting_streams)
+
+ logits = predict_logits[:, 0]
+ logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None
+
+ loss = None
+ if labels is not None:
+ loss = self._compute_loss(predict_logits, labels)
+
+ if not return_dict:
+ all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
+ return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
+ else:
+ return XLMProphetNetDecoderLMOutput(
+ loss=loss,
+ logits=logits,
+ logits_ngram=logits_ngram,
+ past_key_values=outputs.past_key_values,
+ hidden_states=outputs.hidden_states,
+ hidden_states_ngram=outputs.hidden_states_ngram,
+ attentions=outputs.attentions,
+ ngram_attentions=outputs.ngram_attentions,
+ cross_attentions=outputs.cross_attentions,
+ )
+
+ def _compute_loss(self, logits, labels, ignore_index=-100):
+ expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
+
+ for i in range(self.config.ngram):
+ if i > 0 and self.disable_ngram_loss:
+ break
+ expend_targets[i, :, :] = labels
+
+ logits = logits.transpose(0, 1).contiguous()
+ lprobs = nn.functional.log_softmax(
+ logits.view(-1, logits.size(-1)),
+ dim=-1,
+ dtype=torch.float32,
+ )
+
+ loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
+
+ if self.config.eps > 0.0:
+ smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+ non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
+ smooth_loss = smooth_loss[non_masked_tokens]
+ smooth_loss = smooth_loss.mean()
+
+ eps_i = self.config.eps / lprobs.size(-1)
+ loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
+
+ return loss
+
+ def prepare_inputs_for_generation(
+ self,
+ input_ids,
+ past_key_values=None,
+ attention_mask=None,
+ head_mask=None,
+ use_cache=None,
+ **kwargs,
+ ):
+ # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+ if attention_mask is None:
+ attention_mask = input_ids.new_ones(input_ids.shape)
+
+ if past_key_values:
+ input_ids = input_ids[:, -1:]
+ # first step, decoder_cached_states are empty
+ return {
+ "input_ids": input_ids, # encoder_outputs is defined. input_ids not needed
+ "attention_mask": attention_mask,
+ "head_mask": head_mask,
+ "past_key_values": past_key_values,
+ "use_cache": use_cache,
+ }
+
+ @staticmethod
+ def _reorder_cache(past_key_values, beam_idx):
+ reordered_past = ()
+ for layer_past in past_key_values:
+ reordered_past += (
+ tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+ )
+ return reordered_past
+
+
+class XLMProphetNetDecoderWrapper(XLMProphetNetPreTrainedModel):
+ """
+ This is a wrapper class, so that [`XLMProphetNetForCausalLM`] can correctly be loaded from pretrained XLMProphetNet
+ classes.
+ """
+
+ def __init__(self, config: XLMProphetNetConfig):
+ super().__init__(config)
+
+ self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+ self.decoder = XLMProphetNetDecoder(config, word_embeddings=self.word_embeddings)
+
+ # Initialize weights and apply final processing
+ self.post_init()
+
+ def _tie_weights(self):
+ self._tie_or_clone_weights(self.word_embeddings, self.decoder.get_input_embeddings())
+
+ def forward(self, *args, **kwargs):
+ return self.decoder(*args, **kwargs)
+
+
+__all__ = [
+ "XLMProphetNetDecoder",
+ "XLMProphetNetEncoder",
+ "XLMProphetNetForCausalLM",
+ "XLMProphetNetForConditionalGeneration",
+ "XLMProphetNetModel",
+ "XLMProphetNetPreTrainedModel",
+]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..77431b13c49f47ac9df31b3d4e76ac9e2060895d
--- /dev/null
+++ b/venv/lib/python3.13/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py
@@ -0,0 +1,322 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import os
+from shutil import copyfile
+from typing import Any, Optional
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
+
+
+def load_vocab(vocab_file):
+ """Loads a vocabulary file into a dictionary."""
+ vocab = collections.OrderedDict()
+ with open(vocab_file, "r", encoding="utf-8") as reader:
+ tokens = reader.readlines()
+ for index, token in enumerate(tokens):
+ token = token.rstrip("\n")
+ vocab[token] = index
+ return vocab
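+
+
+ # Hypothetical usage sketch for load_vocab (the file name below is made up):
+ #     vocab = load_vocab("prophetnet.vocab.txt")
+ #     vocab["the"]  # -> zero-based line index of "the" in the vocabulary file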
+
+
+class XLMProphetNetTokenizer(PreTrainedTokenizer):
+ """
+ Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+ [SentencePiece](https://github.com/google/sentencepiece).
+
+ This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+ this superclass for more information regarding those methods.
+
+ Args:
+ vocab_file (`str`):
+ Path to the vocabulary file.
+ bos_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
+
+ <Tip>
+
+ When building a sequence using special tokens, this is not the token that is used for the beginning of
+ sequence. The token used is the `cls_token`.
+
+ </Tip>
+
+ eos_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The end of sequence token.
+
+ <Tip>
+
+ When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+ The token used is the `sep_token`.
+
+ </Tip>
+
+ sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+ The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+ sequence classification or for a text and a question for question answering. It is also used as the last
+ token of a sequence built with special tokens.
+ unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+ The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+ token instead.
+ pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+ The token used for padding, for example when batching sequences of different lengths.
+ cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+ The classifier token which is used when doing sequence classification (classification of the whole sequence
+ instead of per-token classification). It is the first token of the sequence when built with special tokens.
+ mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+ The token used for masking values. This is the token used when training this model with masked language
+ modeling. This is the token which the model will try to predict.
+ sp_model_kwargs (`dict`, *optional*):
+ Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+ SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+ to set:
+
+ - `enable_sampling`: Enable subword regularization.
+ - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+ - `nbest_size = {0,1}`: No sampling is performed.
+ - `nbest_size > 1`: samples from the nbest_size results.
+ - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+ using forward-filtering-and-backward-sampling algorithm.
+
+ - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+ BPE-dropout.
+
+ Attributes:
+ sp_model (`SentencePieceProcessor`):
+ The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+ """
+
+ vocab_files_names = VOCAB_FILES_NAMES
+ model_input_names = ["input_ids", "attention_mask"]
+
+ def __init__(
+ self,
+ vocab_file,
+ bos_token="[SEP]",
+ eos_token="[SEP]",
+ sep_token="[SEP]",
+ unk_token="[UNK]",
+ pad_token="[PAD]",
+ cls_token="[CLS]",
+ mask_token="[MASK]",
+ sp_model_kwargs: Optional[dict[str, Any]] = None,
+ **kwargs,
+ ) -> None:
+ self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+ try:
+ import sentencepiece as spm
+ except ImportError:
+ logger.warning(
+ "You need to install SentencePiece to use XLMProphetNetTokenizer: https://github.com/google/sentencepiece"
+ " pip install sentencepiece"
+ )
+ raise
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(str(vocab_file))
+ self.vocab_file = vocab_file
+
+ # Original fairseq vocab and spm vocab must be "aligned":
+ # Vocab | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | 8 | 9
+ # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+ # fairseq | '<s>' | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's' | '▁de' | '-'
+ # spm | '<unk>' | '<s>' | '</s>' | ',' | '.' | '▁' | 's' | '▁de' | '-' | '▁a'
+
+ # put special tokens and [unused] tokens into the vocab
+ self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
+
+ for i in range(10):
+ tok = f"[unused{i}]"
+ self.fairseq_tokens_to_ids[tok] = 5 + i
+
+ # The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
+ self.fairseq_offset = 12
+ self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+ # TODO ArthurZ fairseq_ids_to_tokens should be removed
+
+ super().__init__(
+ bos_token=bos_token,
+ eos_token=eos_token,
+ sep_token=sep_token,
+ unk_token=unk_token,
+ pad_token=pad_token,
+ cls_token=cls_token,
+ mask_token=mask_token,
+ sp_model_kwargs=self.sp_model_kwargs,
+ **kwargs,
+ )
+
+ def __getstate__(self):
+ state = self.__dict__.copy()
+ state["sp_model"] = None
+ return state
+
+ def __setstate__(self, d):
+ self.__dict__ = d
+ try:
+ import sentencepiece as spm
+ except ImportError:
+ logger.warning(
+ "You need to install SentencePiece to use XLMProphetNetTokenizer: https://github.com/google/sentencepiece"
+ " pip install sentencepiece"
+ )
+ raise
+
+ # for backward compatibility
+ if not hasattr(self, "sp_model_kwargs"):
+ self.sp_model_kwargs = {}
+
+ self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+ self.sp_model.Load(self.vocab_file)
+
+ def get_special_tokens_mask(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+ ) -> list[int]:
+ """
+ Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+ special tokens using the tokenizer `prepare_for_model` method.
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of IDs.
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+ already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+ Whether or not the token list is already formatted with special tokens for the model.
+
+ Returns:
+ `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+ """
+
+ if already_has_special_tokens:
+ return super().get_special_tokens_mask(
+ token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+ )
+
+ if token_ids_1 is None:
+ return ([0] * len(token_ids_0)) + [1]
+ return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
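+
+ # Example sketch (illustrative note): token_ids_0 = [5, 6] gives [0, 0, 1], the
+ # trailing 1 marking the appended `[SEP]`; token_ids_0 = [5, 6] with
+ # token_ids_1 = [7] gives [0, 0, 1, 0, 1].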
+
+ def create_token_type_ids_from_sequences(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ """
+ Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLMProphetNet
+ does not make use of token type ids, therefore a list of zeros is returned.
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of IDs.
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `list[int]`: List of zeros.
+
+ """
+
+ sep = [self.sep_token_id]
+
+ if token_ids_1 is None:
+ return len(token_ids_0 + sep) * [0]
+ return len(token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
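+
+ # Example sketch (illustrative note): token_ids_0 = [5, 6] gives [0, 0, 0]
+ # (sequence plus one `[SEP]`); with token_ids_1 = [7] the method counts
+ # `token_ids_0 + sep + sep + token_ids_1 + sep` and therefore returns six zeros.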
+
+ @property
+ def vocab_size(self):
+ return len(self.sp_model) + self.fairseq_offset
+
+ def get_vocab(self):
+ vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+ vocab.update(self.added_tokens_encoder)
+ return vocab
+
+ def _tokenize(self, text: str) -> list[str]:
+ return self.sp_model.encode(text, out_type=str)
+
+ def _convert_token_to_id(self, token):
+ """Converts a token (str) in an id using the vocab."""
+ if token in self.fairseq_tokens_to_ids:
+ return self.fairseq_tokens_to_ids[token]
+ spm_id = self.sp_model.PieceToId(token)
+
+ # Need to return unknown token if the SP model returned 0
+ return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
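+
+ # Offset sketch (illustrative note): "," has id 3 in the spm vocab, so it maps to
+ # 3 + fairseq_offset = 15 in the model vocab, after the five special tokens and
+ # the ten [unusedX] tokens; an spm id of 0 (the spm unknown piece) falls back to
+ # `unk_token_id`.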
+
+ def _convert_id_to_token(self, index):
+ """Converts an index (integer) in a token (str) using the vocab."""
+ if index in self.fairseq_ids_to_tokens:
+ return self.fairseq_ids_to_tokens[index]
+ return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+ def convert_tokens_to_string(self, tokens):
+ """Converts a sequence of tokens (strings for sub-words) in a single string."""
+ out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+ return out_string
+
+ def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+ if not os.path.isdir(save_directory):
+ logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+ return
+ out_vocab_file = os.path.join(
+ save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+ )
+
+ if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+ copyfile(self.vocab_file, out_vocab_file)
+ elif not os.path.isfile(self.vocab_file):
+ with open(out_vocab_file, "wb") as fi:
+ content_spiece_model = self.sp_model.serialized_model_proto()
+ fi.write(content_spiece_model)
+
+ return (out_vocab_file,)
+
+ def build_inputs_with_special_tokens(
+ self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+ ) -> list[int]:
+ """
+ Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+ adding special tokens. A XLMProphetNet sequence has the following format:
+
+ - single sequence: `X [SEP]`
+ - pair of sequences: `A [SEP] B [SEP]`
+
+ Args:
+ token_ids_0 (`list[int]`):
+ List of IDs to which the special tokens will be added
+ token_ids_1 (`list[int]`, *optional*):
+ Optional second list of IDs for sequence pairs.
+
+ Returns:
+ `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+ """
+
+ if token_ids_1 is None:
+ return token_ids_0 + [self.sep_token_id]
+ sep = [self.sep_token_id]
+ return token_ids_0 + sep + token_ids_1 + sep
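+
+ # Example sketch (illustrative note, assuming the fairseq mapping above where
+ # `[SEP]` has id 2): token_ids_0 = [5, 6] becomes [5, 6, 2], and the pair
+ # token_ids_0 = [5, 6], token_ids_1 = [7] becomes [5, 6, 2, 7, 2].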
+
+
+__all__ = ["XLMProphetNetTokenizer"]
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f251eb68580c4ef4d77ad63447e7bd80d4d0b5b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a3e427bfb1dae7581fd9a5d4becef3b89f5f046
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/configuration_dinov3_vit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a3539697b4561073d90dab98e6269632c7f4722
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/image_processing_dinov3_vit_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14570c29f3c1c819725c626c9ae2505a8ceabe56
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/modeling_dinov3_vit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77a23df7042247dd1ebc28e3373b00e91dc09991
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dinov3_vit/__pycache__/modular_dinov3_vit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05cd049a2f6549eccf692d95163b79854bb6f7c0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e88c8a76dced41689d397e69cf0a500fb358945c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/configuration_dpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3caa5cf90792cc976c7bce2c2fcc8cf1bb9b8fa
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/feature_extraction_dpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f9ab5b339724275b59d1331d8f457dca233c2c3
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66b258538c749aeff68c526861044f2e200f39c1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/image_processing_dpt_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6c211c132462477a2b5d3a898903ec5c053ee3e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/modeling_dpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e9920b67dfd5f8d9e34a6ea26baa607851ae41e2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/dpt/__pycache__/modular_dpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b1df85d5101ea1a043773bc15209c063ddf26bc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24ce41198a9a03d580f152346065d40d85932a2f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67ca8d85aa6795bcc3420b1fc152cde191be698c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2059995b0cd67b579e4a4527311504cbbfaf89e1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5d7f9dd5ad6861a76d918fe61e980cda486e53f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72e191c728fe4ddbe2498d70ea25547815cc996c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6d2f165ace7b11c2b2b1213bdcfe5c06e951c20
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f553241b42c33b831b6b297f0ce4714e83d71b5a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1bf78651537887f948469ec9b94ebb96fb95ec61
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c73ae221e123614ae471e8ac657a07a5eeb89fa
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e13ee8703e51b248e4abf07ce215905c3f58a39d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78424ddb3f4df99be0fc1326cd1293ed09e8cc18
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77958a5c3722198770ddcff8873527e996cd2a81
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8deef8458f4f01eb09a191d09ab9cdb4e4d03cf0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb1bf4684a76c1835e41ecfa395be3b1fe0e5f78
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28a6e5aa87470920465113e377594ce94f24b763
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7017117871df17a2bf83896f238b96ac7c060dc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31e2428541f0fa4f9e8bf19b774cb6626ccb58b0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/configuration_gemma2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/configuration_gemma2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b61707b795573d28925892ef79d9a07bb637b02d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/configuration_gemma2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/modeling_gemma2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/modeling_gemma2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7eba0f3a4403897772fba8c50079451127153247
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/modeling_gemma2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/modular_gemma2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/modular_gemma2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9570b8fc838ca729e4219780cc8fec568b105036
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma2/__pycache__/modular_gemma2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da0509e3bb918db9fb007a0bb49530554ad327bd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/configuration_gemma3n.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/configuration_gemma3n.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69e8855c21df67590b1d4eedb8b16d88bd8f53f5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/configuration_gemma3n.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/feature_extraction_gemma3n.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/feature_extraction_gemma3n.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48a3064e8b5d8bd752339cf7adc9924ad5cba3b9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/feature_extraction_gemma3n.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/processing_gemma3n.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/processing_gemma3n.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32ed11e5209e7569838d831275aeca395d906c9c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gemma3n/__pycache__/processing_gemma3n.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7c2cb121df6ac4e9db3974d999de738ca5ea452
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d5a216dd14675de423be5019d3fd8d59b8b67b4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/configuration_glm4v_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd26eb8190d5e46ef3e588b92e5737d5ddcddf16
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/modeling_glm4v_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe59149acbc8db027f659361e8bc4b0c256af074
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/glm4v_moe/__pycache__/modular_glm4v_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..358739f9486d7e7eebce17ef609ebf3d40a356ac
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22cbafa1a7a4290a3054c4c96bb4843fd0dc0a70
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f405d50c457bb8a9778a7a485e4a37ccd8cc0a9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c52c88eb3b98c1baf5d816834db570367b4f75bc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..942c9ed1383f64c72d6bb7f2c618735586e73fbf
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5580f14b0c92d96441354bc20ab5061212f9eb1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4ba20e9b5937ff2b131c1ce884fb8f05788dded
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4963880abe7faa90332f858f0a68e4f47bcec2e9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad66b0b1cb906369a1264ec7477c1ae7da449f17
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b59ab6d3cf18663d1c31d5e4b218df551ce956f1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c99138b769f8b6fc7b32f2ae91d2fbe47cbd654
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77808e8e89df24b8c79f982985fb31172972228a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fd88715284010568f201aaedc84d305e3fe8464
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ca3166f0ad145ffc8bc619ba0126a6ff2dbfea8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/configuration_groupvit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5d3d77c3ab0f53b456da9045cb766dc716e60b9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/groupvit/__pycache__/modeling_groupvit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8df1f4eba45f63d74315df68b381263800d94108
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/configuration_hiera.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/configuration_hiera.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..786545eeef78b5d6baeeb5a5359086a9444907df
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/configuration_hiera.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/modeling_hiera.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/modeling_hiera.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..209c09a883bff69b9231b0aaf0cb04582de31a1f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hiera/__pycache__/modeling_hiera.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d93a4cebfdae322a4c05503c6e5ba8e646d75b13
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cef03abca4228bc9b96934c9a91fde6eca13e11
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45270088714bc4489b82d4fe854a5f1e3156ddcb
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25c7a158a4a5a625fb118d2be207397bd70d12fd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94cbcf46639c737687a557cc143147334df36b4f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6e2b08bf5bb200f81a9484123b751fad4ae5223
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/configuration_hunyuan_v1_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/configuration_hunyuan_v1_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..528786511ee5ec4a39e0e74063b7e5c521eb0f81
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/configuration_hunyuan_v1_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modeling_hunyuan_v1_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modeling_hunyuan_v1_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e872630f9ec7b3203a0d05dd17d9a85efe7cdb76
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modeling_hunyuan_v1_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modular_hunyuan_v1_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modular_hunyuan_v1_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d8d20773728bd51c46df69be82a04896aff5cfd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/hunyuan_v1_moe/__pycache__/modular_hunyuan_v1_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..837dea313d2a588d32bdade5aac8e5e5cc2eb444
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b0d5357fa325741034560be9fc6ab8c07a565ea1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/configuration_ibert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4e1dd95221aaa7ef2aa689e76bc1908a922e2db
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/modeling_ibert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a740c2a2406bb2d1acd6692ec860c4f35d6e8fb
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/ibert/__pycache__/quant_modules.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1c05a2088fdf715a31122e4faee3975c34288ef
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a23d95e32439e5f96e90450726290331279878a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/configuration_idefics3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59da5d40d496bd0388a53eb410fc14eb2fe976e3
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6982697a87ed1308999f58cf6218e5f5bd570aa6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/image_processing_idefics3_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e985fd4b027c0dcfb2cf507d9f197054bdad1d8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/modeling_idefics3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c04bccea408b05e7e273fd3b79b6dcbe39fe828
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/idefics3/__pycache__/processing_idefics3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fbd79d9432b9ddb5024abe90f5bb6001129a3dc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/configuration_imagegpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/configuration_imagegpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6229f3f29d9c0020bf9e86fd948076f9dccb3d0f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/configuration_imagegpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/feature_extraction_imagegpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/feature_extraction_imagegpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fb5bccc8424ee9deddee6b35b4efc61974fdc20
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/feature_extraction_imagegpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4b8417e47fbe8d87017eaf8465d0c4a1a988584
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..41271ca11f2adf25c58ff5654015485dd755e07a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/image_processing_imagegpt_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/modeling_imagegpt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/modeling_imagegpt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5b650417ac9251d633230e0d6e62134c04a6ecfd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/imagegpt/__pycache__/modeling_imagegpt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..90cd4cefc6e6ae01314a84bb420b4b507a9e4e1b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/configuration_layoutlm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/configuration_layoutlm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea7b93d78b018e6db991094c76461dda6115d54d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/configuration_layoutlm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/modeling_layoutlm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/modeling_layoutlm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9786819f3d50bd05ebcc3c696ceeea34c86fe223
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/modeling_layoutlm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/modeling_tf_layoutlm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/modeling_tf_layoutlm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..471858248364e0a758a4e278170f6e60eb19d58e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/modeling_tf_layoutlm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4926a49b46a8707b5b8378d484d0e1744aa2d764
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d8941e893082044e93ba96c6b9b81d9739c1d096
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/layoutlm/__pycache__/tokenization_layoutlm_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d87bf56a9bde8a8ce167c6fa04aa70d0b557494
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e006ff890a0c812a2fb5ebf7aa15d9beec9bd0d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2c702d3d438223290a4def036f3e88218e1065e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51a6c63cc73689a52f9e16ac9a82864145610e24
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5fdc1b2ef73a2ed8181145087627774627a9c998
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..adb15ce2f27aa427f7a93c9370a7ed9c6957e383
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26ff89a39227b7a294836245c3690f8fc600b62b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/configuration_mamba.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/configuration_mamba.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..912685431f2c78a3f501ad0244b1d4026b6d34f9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/configuration_mamba.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/modeling_mamba.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/modeling_mamba.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4cd0a3a307143917831f87d944401ef7f64e7adf
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mamba/__pycache__/modeling_mamba.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bdbccdeb24cd1aa161c746ea57bf3a60edf2a9b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/configuration_marian.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/configuration_marian.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52ef0166d43bde248a6aa3e2e5b4dcdadfacac44
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/configuration_marian.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_flax_marian.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_flax_marian.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e50b4d5af7d920c237135025c79e973252c50283
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_flax_marian.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_marian.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_marian.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0eadefaff442f70ff9d911178c2ee50e09b02d16
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_marian.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_tf_marian.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_tf_marian.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..800a49426c16fb83410c75bdd24c0400bd71743f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/modeling_tf_marian.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/tokenization_marian.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/tokenization_marian.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e95aff81815721ba263434d380e687fc5f22e248
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/marian/__pycache__/tokenization_marian.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9a53f87b71ba3f007356604564bf9db83493f05
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/configuration_mask2former.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/configuration_mask2former.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa72d5e2c1eea83d9531c9c40e329aca9dab6c3b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/configuration_mask2former.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/image_processing_mask2former.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/image_processing_mask2former.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..051483616f94e932493c57038e6b1ab84d5edd1b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/image_processing_mask2former.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/image_processing_mask2former_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/image_processing_mask2former_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..304bfc35ec8014d0ff7b173a9cb2464fe509260f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/image_processing_mask2former_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/modular_mask2former.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/modular_mask2former.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07187cfdecde79c8027cbea4d0ebc310446f8cf5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mask2former/__pycache__/modular_mask2former.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6103b5ff062170aa1397a3e87bfd126d56094fdf
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57013002b69ea39066a2ae1f2835e062a5d9b29a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/configuration_metaclip_2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6be0f9d4ae0195a5f4218ed1b563b343f4b0285b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/modeling_metaclip_2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b21bd25771343a65fb79fc152890ac4be2e05472
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/metaclip_2/__pycache__/modular_metaclip_2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74d4d0da721dcc66328604d1f0b5d66838c41d94
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69df89920c35dc3e9e32860d00719dc0e2f8ae11
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/configuration_mimi.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..33cfd4c45df8dab0cb0406d25992c8db09324ab8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mimi/__pycache__/modeling_mimi.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d34f9334195be05b8516cfa81dfafe35cd330006
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fa3f202ef75a1ca5838ccd99a5be1dec7c122d2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae0479b62b911db6fee338a4ca4e2ebb7cc75a3e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f33f31797ed7026938a1139dd6680ee0db67f16c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75355796d10d5aa509cf452b65cb2dd3f60d9d9a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..150467e9ebf8f734e2ef177c2ec64146cf685665
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f389620211433b6840b6991ab340b9a83e6d644
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/moshi/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c752ba042e3a0044e47bde1a711e73337417e51a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/moshi/__pycache__/configuration_moshi.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..001ea340f7575bfce28efa2cd5df9797c68a63c0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/configuration_musicgen.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/configuration_musicgen.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..109e8d5392a0d908ecb8efc8e3023996f124710b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/configuration_musicgen.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/processing_musicgen.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/processing_musicgen.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8ebcf3b1574bba3e4b65537c75011f47ccffb0b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen/__pycache__/processing_musicgen.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a17871293269e1e5c1da39636989d3e49a20d2a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/configuration_musicgen_melody.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/configuration_musicgen_melody.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3675863a3d4a14702ff57b4f7772df0897bbfde8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/configuration_musicgen_melody.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/feature_extraction_musicgen_melody.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/feature_extraction_musicgen_melody.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57e44ebf76ead394dc1233cacbad0d0ab2f2b061
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/feature_extraction_musicgen_melody.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/modeling_musicgen_melody.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/modeling_musicgen_melody.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4dd406ad0749a3ba1c3370df4577fe0da48cd486
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/modeling_musicgen_melody.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/processing_musicgen_melody.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/processing_musicgen_melody.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e03f4a5d8878f90399e27d77f311f71ea7331c65
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/musicgen_melody/__pycache__/processing_musicgen_melody.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05f998902cf91a71275568ee3376e06796a45737
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1dbb5343202ba048cba5f919a9f279a5b6dd8aec
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea6c09a2ec27f541255a8deed92c6320f399aa1d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..123a13e5ecec16939614e2b0f11a5fb894599daf
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d4c6904b7acbcef6914f48ba597842fb003a38c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67ef86e1fc3d41db812803a3fe5f78cfdab3724a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74595ce7df549bccbbe4cf854963f93dec22600b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3180b15d57e7ef83a14478f54d72270cef63cb7a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a218daf26d3e4608c4ebc0851a9df73b214eba18
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..439c0c78f23d3552cad8ff98c9a28c785ab7b1de
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93c72e3d9dc717b17e3cee6ab81ed038e85e9a82
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9729deb921c1de54b80e0d8c758c71ec2fc3afc6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6c524db5221214541869b8756bcc0745c064297
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf45f0772e2827a4ad76682d757dcd8f10626de3
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae862a3fda60ec308b22f20ee4a9a2a11f4ac6ac
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1a3adfe68bb4c84691ee95415db9ad64fe87e46a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7429f57b1f5f4c764de45ebbf7a4669853d4979e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d892e7a9f24402fdfe86868c3fbb40b16556c4e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba7cd106933a7facc0fef802be54120898692a09
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8e6ba772b3592f43376d34fd69291e0fc866998
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eca71b179835c081232db0cf3bbcc642d937fc6c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..581893ee16d7a74de70654a77fcc6527684986c1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b267b85811694e300dfc16e51650f22fc6d28ba
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6268d80b8a33a0a128350eb735df3fcdcd642556
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/configuration_owlv2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/configuration_owlv2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..558503ea6515cf9ee5d75157a0f9aeda0819dda4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/configuration_owlv2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/image_processing_owlv2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/image_processing_owlv2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1baf08bf0247e1f2a11467a20f2c42d2c7e9d4e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/image_processing_owlv2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/image_processing_owlv2_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/image_processing_owlv2_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53b325d3f1c43a2e8d86c34f8a81549124c55d9b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/image_processing_owlv2_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/modeling_owlv2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/modeling_owlv2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87e6c6b816a5d10aba36c0664fb883c8ca65bfc2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/modeling_owlv2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/modular_owlv2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/modular_owlv2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdc253dcb51d1bc1ae15ac4714ad04d1ca0218a2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/modular_owlv2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/processing_owlv2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/processing_owlv2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba4378fbb4f27b9346342bd17a923af2e4f2a33a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/owlv2/__pycache__/processing_owlv2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2d2a733e421247a382d56038a67140ffee349f5b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/configuration_patchtsmixer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/configuration_patchtsmixer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..07db206f5d10657dcf0f71ae9d33bfdf4ccd7f10
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/configuration_patchtsmixer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/modeling_patchtsmixer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/modeling_patchtsmixer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d49da07e78bb8c6db7a9dfc28dcae921a16d65a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/patchtsmixer/__pycache__/modeling_patchtsmixer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da08a59e17c862042cda428cdfa6bae8c461d4fc
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38b607846fbecd9410a7bb0fc41779be6c924478
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf3dbe547321bbccf4f111f828cae6a350bae045
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..108143fd497c7e69588db75e8dfd241af35b0243
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cfa2dff2c3147e02fda47047a03a6a19f5d66e1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd8fdd1163f917580122092ccf516f2175383e6a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/configuration_phimoe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8bb4e0f8852eb144a64ffe5c97cd22409e41ca4c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/phimoe/__pycache__/modeling_phimoe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd56d98486385e727a552ba9987826a7e1ce9423
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/configuration_poolformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/configuration_poolformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67262fee8cf9b91b11bb520128a4e65a7fe99a66
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/configuration_poolformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/feature_extraction_poolformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/feature_extraction_poolformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d833afa588c92f9744a2a4a2f6b9a94975950be
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/feature_extraction_poolformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/image_processing_poolformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/image_processing_poolformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5702583088af2fe04c462344d34e2899e63b73a8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/image_processing_poolformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/image_processing_poolformer_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/image_processing_poolformer_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e14e9e5ed5a942022cbc0c2c3f5fb3cf26aa9a6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/image_processing_poolformer_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/modeling_poolformer.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/modeling_poolformer.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6935826f46046726c8a43a0c1d20fc748dde722
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/poolformer/__pycache__/modeling_poolformer.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acd02b9e892ea239f5bb5c58bf9f0cd48da8bcda
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b07ddd779681b737c8d72a612a12b8329f6f73ce
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/configuration_prompt_depth_anything.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2dfe8636bede825df058deef11229ab20f6589d9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cab9efec2c3f8114bc925e9c8bbd790da35550a3
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/image_processing_prompt_depth_anything_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35ff678c9c4e4c8fb9c1e01a64ea9f3441cb4027
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/modeling_prompt_depth_anything.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87af86053192c4d54b3e5c55f92956216438b8b1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/prompt_depth_anything/__pycache__/modular_prompt_depth_anything.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c4abcbb293c51263edc824f595676337b87cef5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/configuration_qwen2_5_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/configuration_qwen2_5_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1bddac96ff09c17f4e1d05b2c7a7ee78c33035be
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/configuration_qwen2_5_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/modeling_qwen2_5_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/modeling_qwen2_5_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5fc9de972979f721fbf6641706d20fb6f23a082
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/modeling_qwen2_5_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/modular_qwen2_5_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/modular_qwen2_5_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c270376770014052767feb703fe8d06f8aa09c7
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/modular_qwen2_5_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/processing_qwen2_5_vl.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/processing_qwen2_5_vl.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f9f24723c721d1ea79f3f46e4326c3a6d76c842
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen2_5_vl/__pycache__/processing_qwen2_5_vl.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb10fbe5ecbe6e28c69f58e97dccfba2a04c9785
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba2a2265c7e7808756c766e8aa92dbd69c89ee12
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be4be406801abe30a1990003b8c0dffb0a5e5c22
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe234f40aae6850dd16d180eeabb26313dcf9ad9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4e17080ef340139b7f15a52528c9d526d9e4503
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af289fb20575a849b134cdfa6778db66d308691f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/configuration_qwen3_omni_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b75be69b56a48486d6c1b7d103dbd937616f2715
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/qwen3_omni_moe/__pycache__/processing_qwen3_omni_moe.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..14e3e13dea8f4bd51cb391a10e084dd1dd8d7dea
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/configuration_rag.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/configuration_rag.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..565640c60b7afb1bc21410bcea7d40a9173dacfa
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/configuration_rag.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/modeling_rag.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/modeling_rag.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..419ff348ef17571d8b891dec61b5e96a4be151a2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/modeling_rag.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/modeling_tf_rag.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/modeling_tf_rag.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..efe0db90cc3764544a196924b33bebd391f3fdcd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/modeling_tf_rag.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/retrieval_rag.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/retrieval_rag.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3d9fba68d86f0fee037a8647e0e7b65a3c659b84
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/retrieval_rag.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/tokenization_rag.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/tokenization_rag.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45a5a06ecc82acc5ceade4fee45b086a8c8d2eca
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rag/__pycache__/tokenization_rag.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..590ad9361c1236582fb494432ad570ef9cce2401
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..574af2fef42f9ea815e6bb18919756df4cc914f2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/configuration_rembert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f5016a5230ab46ed3ef539122752b30ed8df99d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/modeling_rembert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..432fea14ff533cbef8b411f67450da97361e33dd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/modeling_tf_rembert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1f6f9b4bc9f2a143a8f996f55d8ee7a03a9bd05
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4288d3744e909bcc60a27b99e1616f7e7cd013aa
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/rembert/__pycache__/tokenization_rembert_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecc29a7e080167a9863c9be03e4392a3bab54d87
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1e9404fb1b2320a3b6f46290345fc85f104bd32
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47eb1fea5ec2ac4b905d3f649ef8b267b1b4945e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1fdedf39ca0400342e0ce3e2b33ca10a46542f0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e74e0048a1d50662d8027e828aa608222a070856
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0ac7c4d03130d0f3ad5a979410bfba38c41a3b76
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f2b12c444f570e66a829c112e856dca7bc41c7b6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/configuration_sam2_video.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c0be4f9627e4a6f669cda234561cd79cca9b66d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/processing_sam2_video.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82168f95806b54859605a18267f0b2522ec09197
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam2_video/__pycache__/video_processing_sam2_video.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c005476ab8c35203c4ce2fafb85e8b3c3de6bfcd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..77519e06771140145b87fb009afedd735faf9981
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/configuration_sam_hq.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cff01ca8213bc348358485a3f04c91a26abfabf
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b13fd87fe8cee470c550d8076814932ddfc178eb
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a057d06d7ad584d5680c2f48c01521a59519f17
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5c6c86c78c2e084b0281a491f74e75fe026994a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8abdddf59d5766d24bee7efd9ef8c644abbf1418
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5b748fa684156ed1b6997cce71a29096d7ac5f7
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f77c943a31499e9e13c8f9f1e2a612d03aa2d25
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32c58064f61e2ce7af2cea9abda10a0531270c44
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5dbdb664809ee63c4e34cd2ea79b2d484922bb2b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/configuration_smollm3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64aa62cdbf095daf9e3abfe1914c4e818c4b4134
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/modeling_smollm3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a89f5a8160948f7885df64e679677d5774425a4a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/smollm3/__pycache__/modular_smollm3.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b40c0e6a1671170a1c1dcffea785be47510375c1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..441f853893236bc467fd1d17e34234ca34e2d9d1
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/configuration_speech_to_text.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f0f4db7abb8372fef4da6b312560666b99bbf71
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/feature_extraction_speech_to_text.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eda60b467f43563cd6399b35bcb13d657973dfd9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/modeling_speech_to_text.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a583a41b8feb16bddcc7ac801e6bb798dd837637
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/modeling_tf_speech_to_text.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8896e9aef1a40f7f2f2e50999ffb028991a454da
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/processing_speech_to_text.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dab5bca74333a4bb2c1340ba40d3472d6de95b20
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/speech_to_text/__pycache__/tokenization_speech_to_text.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5524c741061127890522170e57cf65264322fc9a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68a90c18d16fc643e1f907ce278e260ab6a3d141
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11686224eb2eec4d848caf230f3ca7bd0893dfab
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea0157e67c6910e3878267aa492eacd5caf0f2ce
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2b19d0488b421336b463017ff7f9fdcdd8fc8cd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eaa04718091f25c640fd89f0a13a75a77cb8de7e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..122bab8359cf0bd8cd2b09f129371bc376e1665a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/configuration_squeezebert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c4493cbd2ab3445b5707d5ec2f488bdd05af569c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/modeling_squeezebert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a03bf143995340b4a096ef18fa9f70f564f7dcad
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a8468d87ab90fa0745f80ba92f9a93e3339ba51
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/squeezebert/__pycache__/tokenization_squeezebert_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d98040deee1ab6d8b08edb8a347f74163d80ec2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/configuration_starcoder2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/configuration_starcoder2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0909bad93d84b735ba94f57e3e0cf144a3b31c9b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/configuration_starcoder2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/modeling_starcoder2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/modeling_starcoder2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4141d6a87ad3c90f4d2e663ff832fbbffe2b687b
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/modeling_starcoder2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/modular_starcoder2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/modular_starcoder2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..186cee5ea00dff3cddb28b639d5f548ebea3bce6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/starcoder2/__pycache__/modular_starcoder2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aef0cfeebe6e3a5050d51f3d33bdfa2b1e1d964d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7015177c8d3fac473b278ad8bdadef0e79255540
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/configuration_superglue.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6dbf006713900c6bceb71c5ca6315a8020b2fde7
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/image_processing_superglue.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8879ae5ccc0efe821b04b57925f3b4ca84c5a82c
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/superglue/__pycache__/modeling_superglue.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21f3d5f5315ff3c2a9988b77f1b93b9a14d3f509
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..994b8d0e2cab2227bfb4e65af12ae3eeb5331113
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3695cebbf778661927ac90dc54de3743de85b55a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b93d8701cfffc90c5b2abda67c3b733f25f02384
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9dda2ee9116d137a1da36e283457882cbd04032f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11e89f8df3e9b13062973fa43d0ebdc7904d9362
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5cf5ac95dc1741f775004fb9f89dc2eb7d93502e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aba8d246dd7153ebd2ca3fedfcdcf429a2bdd4c8
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c4f4031131fa20155d0cd5f9284ad9dfaa7257f
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/configuration_udop.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/configuration_udop.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1198a32fa184f94c4e109f160c340751cc219652
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/configuration_udop.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/modeling_udop.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/modeling_udop.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1daccc9653ec35f8938c1cfa663a5b2e9bd6ee09
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/modeling_udop.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/processing_udop.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/processing_udop.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1acc0a8fb9ab36c66c11cc3f346e54e9d69e764
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/processing_udop.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/tokenization_udop.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/tokenization_udop.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5ebfe452e69135ff23c163604b92ed4c1b3d9e4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/tokenization_udop.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/tokenization_udop_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/tokenization_udop_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..990e121086179e977faa1ee1284cf3dda9e2e2c5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/udop/__pycache__/tokenization_udop_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5c81c2e0148720d00b0c530b7e35b8180b7e384
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e399c597c5ff07dd10ef7a584cdfd8e35cc7801a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/configuration_umt5.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5236edbc495ea6f743cd10d313ff43d8acf340fb
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/umt5/__pycache__/modeling_umt5.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0270869fda883c8bd1b7982d36f4da7b2594e78
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/configuration_vaultgemma.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/configuration_vaultgemma.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d82ab62d3de15d8d44fb43e77eac2b2bdff86449
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/configuration_vaultgemma.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d5c38ddf95f6d2dff32018ec1b4c7548e20e6101
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8800a72ebe1edc4db95aabbae41176e73e21cd1e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a7faa791e3e5f2d34f19feb95db8551c8eabc80
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/configuration_vision_text_dual_encoder.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/configuration_vision_text_dual_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1358771a358a551d0804b3d90fb14fde63c22c53
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/configuration_vision_text_dual_encoder.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_flax_vision_text_dual_encoder.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_flax_vision_text_dual_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..34bc668361c7907b7e6751364103117225476810
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_flax_vision_text_dual_encoder.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_tf_vision_text_dual_encoder.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_tf_vision_text_dual_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e698fd6e9554b7f23f2938d42e53cd41a7b85ead
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_tf_vision_text_dual_encoder.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_vision_text_dual_encoder.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_vision_text_dual_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb1157b4ebfc5deb711158c7444963c83a6a595e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/modeling_vision_text_dual_encoder.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/processing_vision_text_dual_encoder.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/processing_vision_text_dual_encoder.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9327366a6c8fde3efee827f2462d4f68c38e9367
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vision_text_dual_encoder/__pycache__/processing_vision_text_dual_encoder.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c55f692d1edc68fcc1a79d7376081c3a7f85b746
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8432dda4244ec045c301197293d0e2ce492524d4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c8263f3e8a2e53a9fa90fbe7f309ff4d6e41edff
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5311e8defde36df62bdc60cdeb527ef070b57e2
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f9a905038c9b8808d8407423bf3997e20b3911e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3f851a46a9e659ac3b5bc1e6df167ece36d9e29
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92a019b5074926fedb44557df685f034a89ba9a6
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..44ac6fdd0b24dc72396ca915649c9740185127f0
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/configuration_vivit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/configuration_vivit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..69bef1a255d7aeb33a95d3ec809021d5610ade5a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/configuration_vivit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/image_processing_vivit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/image_processing_vivit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b034ced099d0df9e4ec7e2939755ebc33dbee418
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/image_processing_vivit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/modeling_vivit.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/modeling_vivit.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5ce3136d819583ccccaf9a0606c8def6f6f18625
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/vivit/__pycache__/modeling_vivit.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..18406b35123ff7c9f8b567f79593c96555892173
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae517e9acc2d43c9ad945025418d6ade7e379146
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e79ed6b41b85c908f840d7b605a47ae0a3f9b1bf
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3d3394e062ab4d20f94430cf612b402d7f050f4
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..04a03992deb952115127e78feef25b3412b8c6ca
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a91186eadb431b156a7bf7dc87cbf71cc1aa48a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba2e934c3fa28ca5d98602a13c12cc6612e7756e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2467310e58c1b1f4c62cc45888fe68c0d8d8cddd
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b41e054ed69b83a1124fdfa7a4a58a8010eb5c3d
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92accdcdf0cc59a1d40db60970183b4c95e1e34e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7cd9050d74209367b823bf15792abbd2e176ae6e
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..39e5ddcbd4f024870a61f79beae57039e7334373
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a1ddf579801de37d14258fc0aaf10fa4d2c606a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..786d657b97b7d0728b99970e450fa9d7d941dd04
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a067a7a53704ba77623df626714e3d19adf93fa
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1a6fcea194d7b7a50f40b98d377c65db205acb5
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..099d8d894b89ad5c0c3f51d2ad050986f9d91d0a
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/__init__.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/__init__.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbef0af694dead5b1b5a6e047832fea0f63ebaff
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/__init__.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/configuration_xmod.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/configuration_xmod.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ecd6d580976e54587bd38127ce7e521e7b242df9
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/configuration_xmod.cpython-313.pyc differ
diff --git a/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/modeling_xmod.cpython-313.pyc b/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/modeling_xmod.cpython-313.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..330b02147ae2783c4d6ba2f5f0e352a1460769af
Binary files /dev/null and b/venv/lib/python3.13/site-packages/transformers/models/xmod/__pycache__/modeling_xmod.cpython-313.pyc differ