diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5f3ebf062e12962d4f0a0993f747d1aac5f3f2b0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12312115275b285b929467ed8ea3f948e42dcdbf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/impl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..967aa6235e1ef71da6dd21a4854171835ce416eb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/aiohappyeyeballs/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc13af1ba147b6b882611c1b90ed59fc8279203c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__main__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__main__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82b705491b80c157fbde10299b11dac1d76f2e90
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/__main__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/api.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/api.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a806a21f7b3e7d175eab7d4634c1fac85d427ff
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/api.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/cd.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/cd.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9f71e88996bb0494c71ba5e296ce4a96f77a8d91
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/cd.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/constant.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/constant.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30a951e3c4ce158a77c4de2dfba5d63290dcda5b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/constant.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/legacy.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/legacy.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30e9aee02f4f395c04b4345a39e0446d63246bf4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/legacy.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/md.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/md.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..607daa2281e872d441d93fc3e322b2313d497d33
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/md.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/models.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/models.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16f505f1590a32a616d07f0747e152e80afa519c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/models.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa89ef87aa60ed266adf428222e02ada99603058
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/version.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/version.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b30a36c417f0f8f3b4e6444bf00e0cb37f15b25
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/__pycache__/version.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..543a5a4de49d07690e73df778aa580589d0789c6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__init__.py
@@ -0,0 +1,8 @@
+from __future__ import annotations
+
+from .__main__ import cli_detect, query_yes_no
+
+__all__ = (
+    "cli_detect",
+    "query_yes_no",
+)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py
new file mode 100644
index 0000000000000000000000000000000000000000..64a290f2f86ff5c9a5c16ef4141bd49e3d9f3c8f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__main__.py
@@ -0,0 +1,321 @@
+from __future__ import annotations
+
+import argparse
+import sys
+from json import dumps
+from os.path import abspath, basename, dirname, join, realpath
+from platform import python_version
+from unicodedata import unidata_version
+
+import charset_normalizer.md as md_module
+from charset_normalizer import from_fp
+from charset_normalizer.models import CliDetectionResult
+from charset_normalizer.version import __version__
+
+
+def query_yes_no(question: str, default: str = "yes") -> bool:
+    """Ask a yes/no question via input() and return their answer.
+
+    "question" is a string that is presented to the user.
+    "default" is the presumed answer if the user just hits <Enter>.
+        It must be "yes" (the default), "no" or None (meaning
+        an answer is required of the user).
+
+    The "answer" return value is True for "yes" or False for "no".
+
+    Credit goes to (c) https://stackoverflow.com/questions/3041986/apt-command-line-interface-like-yes-no-input
+    """
+    valid = {"yes": True, "y": True, "ye": True, "no": False, "n": False}
+    if default is None:
+        prompt = " [y/n] "
+    elif default == "yes":
+        prompt = " [Y/n] "
+    elif default == "no":
+        prompt = " [y/N] "
+    else:
+        raise ValueError("invalid default answer: '%s'" % default)
+
+    while True:
+        sys.stdout.write(question + prompt)
+        choice = input().lower()
+        if default is not None and choice == "":
+            return valid[default]
+        elif choice in valid:
+            return valid[choice]
+        else:
+            sys.stdout.write("Please respond with 'yes' or 'no' " "(or 'y' or 'n').\n")
+
+
+def cli_detect(argv: list[str] | None = None) -> int:
+    """
+    CLI assistant using ARGV and ArgumentParser
+    :param argv:
+    :return: 0 if everything is fine, anything else equal trouble
+    """
+    parser = argparse.ArgumentParser(
+        description="The Real First Universal Charset Detector. "
+        "Discover originating encoding used on text file. "
+        "Normalize text to unicode."
+    )
+
+    parser.add_argument(
+        "files", type=argparse.FileType("rb"), nargs="+", help="File(s) to be analysed"
+    )
+    parser.add_argument(
+        "-v",
+        "--verbose",
+        action="store_true",
+        default=False,
+        dest="verbose",
+        help="Display complementary information about file if any. "
+        "Stdout will contain logs about the detection process.",
+    )
+    parser.add_argument(
+        "-a",
+        "--with-alternative",
+        action="store_true",
+        default=False,
+        dest="alternatives",
+        help="Output complementary possibilities if any. Top-level JSON WILL be a list.",
+    )
+    parser.add_argument(
+        "-n",
+        "--normalize",
+        action="store_true",
+        default=False,
+        dest="normalize",
+        help="Permit to normalize input file. If not set, program does not write anything.",
+    )
+    parser.add_argument(
+        "-m",
+        "--minimal",
+        action="store_true",
+        default=False,
+        dest="minimal",
+        help="Only output the charset detected to STDOUT. Disabling JSON output.",
+    )
+    parser.add_argument(
+        "-r",
+        "--replace",
+        action="store_true",
+        default=False,
+        dest="replace",
+        help="Replace file when trying to normalize it instead of creating a new one.",
+    )
+    parser.add_argument(
+        "-f",
+        "--force",
+        action="store_true",
+        default=False,
+        dest="force",
+        help="Replace file without asking if you are sure, use this flag with caution.",
+    )
+    parser.add_argument(
+        "-i",
+        "--no-preemptive",
+        action="store_true",
+        default=False,
+        dest="no_preemptive",
+        help="Disable looking at a charset declaration to hint the detector.",
+    )
+    parser.add_argument(
+        "-t",
+        "--threshold",
+        action="store",
+        default=0.2,
+        type=float,
+        dest="threshold",
+        help="Define a custom maximum amount of noise allowed in decoded content. 0. <= noise <= 1.",
+    )
+    parser.add_argument(
+        "--version",
+        action="version",
+        version="Charset-Normalizer {} - Python {} - Unicode {} - SpeedUp {}".format(
+            __version__,
+            python_version(),
+            unidata_version,
+            "OFF" if md_module.__file__.lower().endswith(".py") else "ON",
+        ),
+        help="Show version information and exit.",
+    )
+
+    args = parser.parse_args(argv)
+
+    if args.replace is True and args.normalize is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
+        print("Use --replace in addition of --normalize only.", file=sys.stderr)
+        return 1
+
+    if args.force is True and args.replace is False:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
+        print("Use --force in addition of --replace only.", file=sys.stderr)
+        return 1
+
+    if args.threshold < 0.0 or args.threshold > 1.0:
+        if args.files:
+            for my_file in args.files:
+                my_file.close()
+        print("--threshold VALUE should be between 0. AND 1.", file=sys.stderr)
+        return 1
+
+    x_ = []
+
+    for my_file in args.files:
+        matches = from_fp(
+            my_file,
+            threshold=args.threshold,
+            explain=args.verbose,
+            preemptive_behaviour=args.no_preemptive is False,
+        )
+
+        best_guess = matches.best()
+
+        if best_guess is None:
+            print(
+                'Unable to identify originating encoding for "{}". {}'.format(
+                    my_file.name,
+                    (
+                        "Maybe try increasing maximum amount of chaos."
+                        if args.threshold < 1.0
+                        else ""
+                    ),
+                ),
+                file=sys.stderr,
+            )
+            x_.append(
+                CliDetectionResult(
+                    abspath(my_file.name),
+                    None,
+                    [],
+                    [],
+                    "Unknown",
+                    [],
+                    False,
+                    1.0,
+                    0.0,
+                    None,
+                    True,
+                )
+            )
+        else:
+            x_.append(
+                CliDetectionResult(
+                    abspath(my_file.name),
+                    best_guess.encoding,
+                    best_guess.encoding_aliases,
+                    [
+                        cp
+                        for cp in best_guess.could_be_from_charset
+                        if cp != best_guess.encoding
+                    ],
+                    best_guess.language,
+                    best_guess.alphabets,
+                    best_guess.bom,
+                    best_guess.percent_chaos,
+                    best_guess.percent_coherence,
+                    None,
+                    True,
+                )
+            )
+
+            if len(matches) > 1 and args.alternatives:
+                for el in matches:
+                    if el != best_guess:
+                        x_.append(
+                            CliDetectionResult(
+                                abspath(my_file.name),
+                                el.encoding,
+                                el.encoding_aliases,
+                                [
+                                    cp
+                                    for cp in el.could_be_from_charset
+                                    if cp != el.encoding
+                                ],
+                                el.language,
+                                el.alphabets,
+                                el.bom,
+                                el.percent_chaos,
+                                el.percent_coherence,
+                                None,
+                                False,
+                            )
+                        )
+
+            if args.normalize is True:
+                if best_guess.encoding.startswith("utf") is True:
+                    print(
+                        '"{}" file does not need to be normalized, as it already came from unicode.'.format(
+                            my_file.name
+                        ),
+                        file=sys.stderr,
+                    )
+                    if my_file.closed is False:
+                        my_file.close()
+                    continue
+
+                dir_path = dirname(realpath(my_file.name))
+                file_name = basename(realpath(my_file.name))
+
+                o_: list[str] = file_name.split(".")
+
+                if args.replace is False:
+                    o_.insert(-1, best_guess.encoding)
+                    if my_file.closed is False:
+                        my_file.close()
+                elif (
+                    args.force is False
+                    and query_yes_no(
+                        'Are you sure to normalize "{}" by replacing it ?'.format(
+                            my_file.name
+                        ),
+                        "no",
+                    )
+                    is False
+                ):
+                    if my_file.closed is False:
+                        my_file.close()
+                    continue
+
+                try:
+                    x_[0].unicode_path = join(dir_path, ".".join(o_))
+
+                    with open(x_[0].unicode_path, "wb") as fp:
+                        fp.write(best_guess.output())
+                except OSError as e:
+                    print(str(e), file=sys.stderr)
+                    if my_file.closed is False:
+                        my_file.close()
+                    return 2
+
+        if my_file.closed is False:
+            my_file.close()
+
+    if args.minimal is False:
+        print(
+            dumps(
+                [el.__dict__ for el in x_] if len(x_) > 1 else x_[0].__dict__,
+                ensure_ascii=True,
+                indent=4,
+            )
+        )
+    else:
+        for my_file in args.files:
+            print(
+                ", ".join(
+                    [
+                        el.encoding or "undefined"
+                        for el in x_
+                        if el.path == abspath(my_file.name)
+                    ]
+                )
+            )
+
+    return 0
+
+
+if __name__ == "__main__":
+    cli_detect()
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d78988ebaa5c921590e5d0c213c125f320463fd2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5dd3f9a811ac46cbccfefb3136d639f894a3ccba
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/charset_normalizer/cli/__pycache__/__main__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..303980838117e13a0a98182159e06e5a301082a6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_convertions.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_convertions.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..75ab7e24b13f77cea22f7fc36144749ba2448fe9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_convertions.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_inspect.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_inspect.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47feae6d46509d9d21d2f0147a1857a8bc9c905f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_inspect.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_pep440.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_pep440.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a6328099baab22b528fd60fd0b776079bd894a0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/_utils/__pycache__/_pep440.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3b24d633bb29573b808b5ca62fe3219008f263c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..38a116d8967a0437ac571956b769e4512d7cf734
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_deprecations.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_deprecations.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23e1c6f355b7fd45eb6d0d00e57a07ce5a12e415
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_deprecations.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_regression.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_regression.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ffa3b2eda05186bd03d0bcf529f54d1882f8ee79
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/__pycache__/test_regression.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_deprecations.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_deprecations.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd4c10832e7e7240175571605a07541f0c188f89
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_deprecations.py
@@ -0,0 +1,20 @@
+"""Test deprecation and future warnings.
+
+"""
+import numpy as np
+from numpy.testing import assert_warns
+
+
+def test_qr_mode_full_future_warning():
+    """Check mode='full' FutureWarning.
+
+    In numpy 1.8 the mode options 'full' and 'economic' in linalg.qr were
+    deprecated. The release date will probably be sometime in the summer
+    of 2013.
+
+    """
+    a = np.eye(2)
+    assert_warns(DeprecationWarning, np.linalg.qr, a, mode='full')
+    assert_warns(DeprecationWarning, np.linalg.qr, a, mode='f')
+    assert_warns(DeprecationWarning, np.linalg.qr, a, mode='economic')
+    assert_warns(DeprecationWarning, np.linalg.qr, a, mode='e')
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_linalg.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_linalg.py
new file mode 100644
index 0000000000000000000000000000000000000000..5dabdfdf010a336741a1f89af101b36c4f62f5ab
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_linalg.py
@@ -0,0 +1,2198 @@
+""" Test functions for linalg module
+
+"""
+import os
+import sys
+import itertools
+import traceback
+import textwrap
+import subprocess
+import pytest
+
+import numpy as np
+from numpy import array, single, double, csingle, cdouble, dot, identity, matmul
+from numpy.core import swapaxes
+from numpy import multiply, atleast_2d, inf, asarray
+from numpy import linalg
+from numpy.linalg import matrix_power, norm, matrix_rank, multi_dot, LinAlgError
+from numpy.linalg.linalg import _multi_dot_matrix_chain_order
+from numpy.testing import (
+    assert_, assert_equal, assert_raises, assert_array_equal,
+    assert_almost_equal, assert_allclose, suppress_warnings,
+    assert_raises_regex, HAS_LAPACK64, IS_WASM
+    )
+try:
+    import numpy.linalg.lapack_lite
+except ImportError:
+    # May be broken when numpy was built without BLAS/LAPACK present
+    # If so, ensure we don't break the whole test suite - the `lapack_lite`
+    # submodule should be removed, it's only used in two tests in this file.
+    pass
+
+
+def consistent_subclass(out, in_):
+    # For ndarray subclass input, our output should have the same subclass
+    # (non-ndarray input gets converted to ndarray).
+    return type(out) is (type(in_) if isinstance(in_, np.ndarray)
+                         else np.ndarray)
+
+
+old_assert_almost_equal = assert_almost_equal
+
+
+def assert_almost_equal(a, b, single_decimal=6, double_decimal=12, **kw):
+    if asarray(a).dtype.type in (single, csingle):
+        decimal = single_decimal
+    else:
+        decimal = double_decimal
+    old_assert_almost_equal(a, b, decimal=decimal, **kw)
+
+
+def get_real_dtype(dtype):
+    return {single: single, double: double,
+            csingle: single, cdouble: double}[dtype]
+
+
+def get_complex_dtype(dtype):
+    return {single: csingle, double: cdouble,
+            csingle: csingle, cdouble: cdouble}[dtype]
+
+
+def get_rtol(dtype):
+    # Choose a safe rtol
+    if dtype in (single, csingle):
+        return 1e-5
+    else:
+        return 1e-11
+
+
+# used to categorize tests
+all_tags = {
+  'square', 'nonsquare', 'hermitian',  # mutually exclusive
+  'generalized', 'size-0', 'strided' # optional additions
+}
+
+
+class LinalgCase:
+    def __init__(self, name, a, b, tags=set()):
+        """
+        A bundle of arguments to be passed to a test case, with an identifying
+        name, the operands a and b, and a set of tags to filter the tests
+        """
+        assert_(isinstance(name, str))
+        self.name = name
+        self.a = a
+        self.b = b
+        self.tags = frozenset(tags)  # prevent shared tags
+
+    def check(self, do):
+        """
+        Run the function `do` on this test case, expanding arguments
+        """
+        do(self.a, self.b, tags=self.tags)
+
+    def __repr__(self):
+        return f'<LinalgCase: {self.name}>'
+
+
+def apply_tag(tag, cases):
+    """
+    Add the given tag (a string) to each of the cases (a list of LinalgCase
+    objects)
+    """
+    assert tag in all_tags, "Invalid tag"
+    for case in cases:
+        case.tags = case.tags | {tag}
+    return cases
+
+
+#
+# Base test cases
+#
+
+np.random.seed(1234)
+
+CASES = []
+
+# square test cases
+CASES += apply_tag('square', [
+    LinalgCase("single",
+               array([[1., 2.], [3., 4.]], dtype=single),
+               array([2., 1.], dtype=single)),
+    LinalgCase("double",
+               array([[1., 2.], [3., 4.]], dtype=double),
+               array([2., 1.], dtype=double)),
+    LinalgCase("double_2",
+               array([[1., 2.], [3., 4.]], dtype=double),
+               array([[2., 1., 4.], [3., 4., 6.]], dtype=double)),
+    LinalgCase("csingle",
+               array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=csingle),
+               array([2. + 1j, 1. + 2j], dtype=csingle)),
+    LinalgCase("cdouble",
+               array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=cdouble),
+               array([2. + 1j, 1. + 2j], dtype=cdouble)),
+    LinalgCase("cdouble_2",
+               array([[1. + 2j, 2 + 3j], [3 + 4j, 4 + 5j]], dtype=cdouble),
+               array([[2. + 1j, 1. + 2j, 1 + 3j], [1 - 2j, 1 - 3j, 1 - 6j]], dtype=cdouble)),
+    LinalgCase("0x0",
+               np.empty((0, 0), dtype=double),
+               np.empty((0,), dtype=double),
+               tags={'size-0'}),
+    LinalgCase("8x8",
+               np.random.rand(8, 8),
+               np.random.rand(8)),
+    LinalgCase("1x1",
+               np.random.rand(1, 1),
+               np.random.rand(1)),
+    LinalgCase("nonarray",
+               [[1, 2], [3, 4]],
+               [2, 1]),
+])
+
+# non-square test-cases
+CASES += apply_tag('nonsquare', [
+    LinalgCase("single_nsq_1",
+               array([[1., 2., 3.], [3., 4., 6.]], dtype=single),
+               array([2., 1.], dtype=single)),
+    LinalgCase("single_nsq_2",
+               array([[1., 2.], [3., 4.], [5., 6.]], dtype=single),
+               array([2., 1., 3.], dtype=single)),
+    LinalgCase("double_nsq_1",
+               array([[1., 2., 3.], [3., 4., 6.]], dtype=double),
+               array([2., 1.], dtype=double)),
+    LinalgCase("double_nsq_2",
+               array([[1., 2.], [3., 4.], [5., 6.]], dtype=double),
+               array([2., 1., 3.], dtype=double)),
+    LinalgCase("csingle_nsq_1",
+               array(
+                   [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=csingle),
+               array([2. + 1j, 1. + 2j], dtype=csingle)),
+    LinalgCase("csingle_nsq_2",
+               array(
+                   [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=csingle),
+               array([2. + 1j, 1. + 2j, 3. - 3j], dtype=csingle)),
+    LinalgCase("cdouble_nsq_1",
+               array(
+                   [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=cdouble),
+               array([2. + 1j, 1. + 2j], dtype=cdouble)),
+    LinalgCase("cdouble_nsq_2",
+               array(
+                   [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=cdouble),
+               array([2. + 1j, 1. + 2j, 3. - 3j], dtype=cdouble)),
+    LinalgCase("cdouble_nsq_1_2",
+               array(
+                   [[1. + 1j, 2. + 2j, 3. - 3j], [3. - 5j, 4. + 9j, 6. + 2j]], dtype=cdouble),
+               array([[2. + 1j, 1. + 2j], [1 - 1j, 2 - 2j]], dtype=cdouble)),
+    LinalgCase("cdouble_nsq_2_2",
+               array(
+                   [[1. + 1j, 2. + 2j], [3. - 3j, 4. - 9j], [5. - 4j, 6. + 8j]], dtype=cdouble),
+               array([[2. + 1j, 1. + 2j], [1 - 1j, 2 - 2j], [1 - 1j, 2 - 2j]], dtype=cdouble)),
+    LinalgCase("8x11",
+               np.random.rand(8, 11),
+               np.random.rand(8)),
+    LinalgCase("1x5",
+               np.random.rand(1, 5),
+               np.random.rand(1)),
+    LinalgCase("5x1",
+               np.random.rand(5, 1),
+               np.random.rand(5)),
+    LinalgCase("0x4",
+               np.random.rand(0, 4),
+               np.random.rand(0),
+               tags={'size-0'}),
+    LinalgCase("4x0",
+               np.random.rand(4, 0),
+               np.random.rand(4),
+               tags={'size-0'}),
+])
+
+# hermitian test-cases
+CASES += apply_tag('hermitian', [
+    LinalgCase("hsingle",
+               array([[1., 2.], [2., 1.]], dtype=single),
+               None),
+    LinalgCase("hdouble",
+               array([[1., 2.], [2., 1.]], dtype=double),
+               None),
+    LinalgCase("hcsingle",
+               array([[1., 2 + 3j], [2 - 3j, 1]], dtype=csingle),
+               None),
+    LinalgCase("hcdouble",
+               array([[1., 2 + 3j], [2 - 3j, 1]], dtype=cdouble),
+               None),
+    LinalgCase("hempty",
+               np.empty((0, 0), dtype=double),
+               None,
+               tags={'size-0'}),
+    LinalgCase("hnonarray",
+               [[1, 2], [2, 1]],
+               None),
+    LinalgCase("matrix_b_only",
+               array([[1., 2.], [2., 1.]]),
+               None),
+    LinalgCase("hmatrix_1x1",
+               np.random.rand(1, 1),
+               None),
+])
+
+
+#
+# Gufunc test cases
+#
+def _make_generalized_cases():
+    new_cases = []
+
+    for case in CASES:
+        if not isinstance(case.a, np.ndarray):
+            continue
+
+        a = np.array([case.a, 2 * case.a, 3 * case.a])
+        if case.b is None:
+            b = None
+        else:
+            b = np.array([case.b, 7 * case.b, 6 * case.b])
+        new_case = LinalgCase(case.name + "_tile3", a, b,
+                              tags=case.tags | {'generalized'})
+        new_cases.append(new_case)
+
+        a = np.array([case.a] * 2 * 3).reshape((3, 2) + case.a.shape)
+        if case.b is None:
+            b = None
+        else:
+            b = np.array([case.b] * 2 * 3).reshape((3, 2) + case.b.shape)
+        new_case = LinalgCase(case.name + "_tile213", a, b,
+                              tags=case.tags | {'generalized'})
+        new_cases.append(new_case)
+
+    return new_cases
+
+
+CASES += _make_generalized_cases()
+
+
+#
+# Generate stride combination variations of the above
+#
+def _stride_comb_iter(x):
+    """
+    Generate cartesian product of strides for all axes
+    """
+
+    if not isinstance(x, np.ndarray):
+        yield x, "nop"
+        return
+
+    stride_set = [(1,)] * x.ndim
+    stride_set[-1] = (1, 3, -4)
+    if x.ndim > 1:
+        stride_set[-2] = (1, 3, -4)
+    if x.ndim > 2:
+        stride_set[-3] = (1, -4)
+
+    for repeats in itertools.product(*tuple(stride_set)):
+        new_shape = [abs(a * b) for a, b in zip(x.shape, repeats)]
+        slices = tuple([slice(None, None, repeat) for repeat in repeats])
+
+        # new array with different strides, but same data
+        xi = np.empty(new_shape, dtype=x.dtype)
+        xi.view(np.uint32).fill(0xdeadbeef)
+        xi = xi[slices]
+        xi[...] = x
+        xi = xi.view(x.__class__)
+        assert_(np.all(xi == x))
+        yield xi, "stride_" + "_".join(["%+d" % j for j in repeats])
+
+        # generate also zero strides if possible
+        if x.ndim >= 1 and x.shape[-1] == 1:
+            s = list(x.strides)
+            s[-1] = 0
+            xi = np.lib.stride_tricks.as_strided(x, strides=s)
+            yield xi, "stride_xxx_0"
+        if x.ndim >= 2 and x.shape[-2] == 1:
+            s = list(x.strides)
+            s[-2] = 0
+            xi = np.lib.stride_tricks.as_strided(x, strides=s)
+            yield xi, "stride_xxx_0_x"
+        if x.ndim >= 2 and x.shape[:-2] == (1, 1):
+            s = list(x.strides)
+            s[-1] = 0
+            s[-2] = 0
+            xi = np.lib.stride_tricks.as_strided(x, strides=s)
+            yield xi, "stride_xxx_0_0"
+
+
+def _make_strided_cases():
+    new_cases = []
+    for case in CASES:
+        for a, a_label in _stride_comb_iter(case.a):
+            for b, b_label in _stride_comb_iter(case.b):
+                new_case = LinalgCase(case.name + "_" + a_label + "_" + b_label, a, b,
+                                      tags=case.tags | {'strided'})
+                new_cases.append(new_case)
+    return new_cases
+
+
+CASES += _make_strided_cases()
+
+
+#
+# Test different routines against the above cases
+#
+class LinalgTestCase:
+    TEST_CASES = CASES
+
+    def check_cases(self, require=set(), exclude=set()):
+        """
+        Run func on each of the cases with all of the tags in require, and none
+        of the tags in exclude
+        """
+        for case in self.TEST_CASES:
+            # filter by require and exclude
+            if case.tags & require != require:
+                continue
+            if case.tags & exclude:
+                continue
+
+            try:
+                case.check(self.do)
+            except Exception as e:
+                msg = f'In test case: {case!r}\n\n'
+                msg += traceback.format_exc()
+                raise AssertionError(msg) from e
+
+
+class LinalgSquareTestCase(LinalgTestCase):
+
+    def test_sq_cases(self):
+        self.check_cases(require={'square'},
+                         exclude={'generalized', 'size-0'})
+
+    def test_empty_sq_cases(self):
+        self.check_cases(require={'square', 'size-0'},
+                         exclude={'generalized'})
+
+
+class LinalgNonsquareTestCase(LinalgTestCase):
+
+    def test_nonsq_cases(self):
+        self.check_cases(require={'nonsquare'},
+                         exclude={'generalized', 'size-0'})
+
+    def test_empty_nonsq_cases(self):
+        self.check_cases(require={'nonsquare', 'size-0'},
+                         exclude={'generalized'})
+
+
+class HermitianTestCase(LinalgTestCase):
+
+    def test_herm_cases(self):
+        self.check_cases(require={'hermitian'},
+                         exclude={'generalized', 'size-0'})
+
+    def test_empty_herm_cases(self):
+        self.check_cases(require={'hermitian', 'size-0'},
+                         exclude={'generalized'})
+
+
+class LinalgGeneralizedSquareTestCase(LinalgTestCase):
+
+    @pytest.mark.slow
+    def test_generalized_sq_cases(self):
+        self.check_cases(require={'generalized', 'square'},
+                         exclude={'size-0'})
+
+    @pytest.mark.slow
+    def test_generalized_empty_sq_cases(self):
+        self.check_cases(require={'generalized', 'square', 'size-0'})
+
+
+class LinalgGeneralizedNonsquareTestCase(LinalgTestCase):
+
+    @pytest.mark.slow
+    def test_generalized_nonsq_cases(self):
+        self.check_cases(require={'generalized', 'nonsquare'},
+                         exclude={'size-0'})
+
+    @pytest.mark.slow
+    def test_generalized_empty_nonsq_cases(self):
+        self.check_cases(require={'generalized', 'nonsquare', 'size-0'})
+
+
+class HermitianGeneralizedTestCase(LinalgTestCase):
+
+    @pytest.mark.slow
+    def test_generalized_herm_cases(self):
+        self.check_cases(require={'generalized', 'hermitian'},
+                         exclude={'size-0'})
+
+    @pytest.mark.slow
+    def test_generalized_empty_herm_cases(self):
+        self.check_cases(require={'generalized', 'hermitian', 'size-0'},
+                         exclude={'none'})
+
+
+def dot_generalized(a, b):
+    a = asarray(a)
+    if a.ndim >= 3:
+        if a.ndim == b.ndim:
+            # matrix x matrix
+            new_shape = a.shape[:-1] + b.shape[-1:]
+        elif a.ndim == b.ndim + 1:
+            # matrix x vector
+            new_shape = a.shape[:-1]
+        else:
+            raise ValueError("Not implemented...")
+        r = np.empty(new_shape, dtype=np.common_type(a, b))
+        for c in itertools.product(*map(range, a.shape[:-2])):
+            r[c] = dot(a[c], b[c])
+        return r
+    else:
+        return dot(a, b)
+
+
+def identity_like_generalized(a):
+    a = asarray(a)
+    if a.ndim >= 3:
+        r = np.empty(a.shape, dtype=a.dtype)
+        r[...] = identity(a.shape[-2])
+        return r
+    else:
+        return identity(a.shape[0])
+
+
+class SolveCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+    # kept apart from TestSolve for use for testing with matrices.
+    def do(self, a, b, tags):
+        x = linalg.solve(a, b)
+        assert_almost_equal(b, dot_generalized(a, x))
+        assert_(consistent_subclass(x, b))
+
+
+class TestSolve(SolveCases):
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        assert_equal(linalg.solve(x, x).dtype, dtype)
+
+    def test_0_size(self):
+        class ArraySubclass(np.ndarray):
+            pass
+        # Test system of 0x0 matrices
+        a = np.arange(8).reshape(2, 2, 2)
+        b = np.arange(6).reshape(1, 2, 3).view(ArraySubclass)
+
+        expected = linalg.solve(a, b)[:, 0:0, :]
+        result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0, :])
+        assert_array_equal(result, expected)
+        assert_(isinstance(result, ArraySubclass))
+
+        # Test errors for non-square and only b's dimension being 0
+        assert_raises(linalg.LinAlgError, linalg.solve, a[:, 0:0, 0:1], b)
+        assert_raises(ValueError, linalg.solve, a, b[:, 0:0, :])
+
+        # Test broadcasting error
+        b = np.arange(6).reshape(1, 3, 2)  # broadcasting error
+        assert_raises(ValueError, linalg.solve, a, b)
+        assert_raises(ValueError, linalg.solve, a[0:0], b[0:0])
+
+        # Test zero "single equations" with 0x0 matrices.
+        b = np.arange(2).reshape(1, 2).view(ArraySubclass)
+        expected = linalg.solve(a, b)[:, 0:0]
+        result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0])
+        assert_array_equal(result, expected)
+        assert_(isinstance(result, ArraySubclass))
+
+        b = np.arange(3).reshape(1, 3)
+        assert_raises(ValueError, linalg.solve, a, b)
+        assert_raises(ValueError, linalg.solve, a[0:0], b[0:0])
+        assert_raises(ValueError, linalg.solve, a[:, 0:0, 0:0], b)
+
+    def test_0_size_k(self):
+        # test zero multiple equation (K=0) case.
+        class ArraySubclass(np.ndarray):
+            pass
+        a = np.arange(4).reshape(1, 2, 2)
+        b = np.arange(6).reshape(3, 2, 1).view(ArraySubclass)
+
+        expected = linalg.solve(a, b)[:, :, 0:0]
+        result = linalg.solve(a, b[:, :, 0:0])
+        assert_array_equal(result, expected)
+        assert_(isinstance(result, ArraySubclass))
+
+        # test both zero.
+        expected = linalg.solve(a, b)[:, 0:0, 0:0]
+        result = linalg.solve(a[:, 0:0, 0:0], b[:, 0:0, 0:0])
+        assert_array_equal(result, expected)
+        assert_(isinstance(result, ArraySubclass))
+
+
+class InvCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+    def do(self, a, b, tags):
+        a_inv = linalg.inv(a)
+        assert_almost_equal(dot_generalized(a, a_inv),
+                            identity_like_generalized(a))
+        assert_(consistent_subclass(a_inv, a))
+
+
+class TestInv(InvCases):
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        assert_equal(linalg.inv(x).dtype, dtype)
+
+    def test_0_size(self):
+        # Check that all kinds of 0-sized arrays work
+        class ArraySubclass(np.ndarray):
+            pass
+        a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+        res = linalg.inv(a)
+        assert_(res.dtype.type is np.float64)
+        assert_equal(a.shape, res.shape)
+        assert_(isinstance(res, ArraySubclass))
+
+        a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+        res = linalg.inv(a)
+        assert_(res.dtype.type is np.complex64)
+        assert_equal(a.shape, res.shape)
+        assert_(isinstance(res, ArraySubclass))
+
+
+class EigvalsCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+    def do(self, a, b, tags):
+        ev = linalg.eigvals(a)
+        evalues, evectors = linalg.eig(a)
+        assert_almost_equal(ev, evalues)
+
+
+class TestEigvals(EigvalsCases):
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        assert_equal(linalg.eigvals(x).dtype, dtype)
+        x = np.array([[1, 0.5], [-1, 1]], dtype=dtype)
+        assert_equal(linalg.eigvals(x).dtype, get_complex_dtype(dtype))
+
+    def test_0_size(self):
+        # Check that all kinds of 0-sized arrays work
+        class ArraySubclass(np.ndarray):
+            pass
+        a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+        res = linalg.eigvals(a)
+        assert_(res.dtype.type is np.float64)
+        assert_equal((0, 1), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(res, np.ndarray))
+
+        a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+        res = linalg.eigvals(a)
+        assert_(res.dtype.type is np.complex64)
+        assert_equal((0,), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(res, np.ndarray))
+
+
+class EigCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+    def do(self, a, b, tags):
+        res = linalg.eig(a)
+        eigenvalues, eigenvectors = res.eigenvalues, res.eigenvectors
+        assert_allclose(dot_generalized(a, eigenvectors),
+                        np.asarray(eigenvectors) * np.asarray(eigenvalues)[..., None, :],
+                        rtol=get_rtol(eigenvalues.dtype))
+        assert_(consistent_subclass(eigenvectors, a))
+
+
+class TestEig(EigCases):
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        w, v = np.linalg.eig(x)
+        assert_equal(w.dtype, dtype)
+        assert_equal(v.dtype, dtype)
+
+        x = np.array([[1, 0.5], [-1, 1]], dtype=dtype)
+        w, v = np.linalg.eig(x)
+        assert_equal(w.dtype, get_complex_dtype(dtype))
+        assert_equal(v.dtype, get_complex_dtype(dtype))
+
+    def test_0_size(self):
+        # Check that all kinds of 0-sized arrays work
+        class ArraySubclass(np.ndarray):
+            pass
+        a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+        res, res_v = linalg.eig(a)
+        assert_(res_v.dtype.type is np.float64)
+        assert_(res.dtype.type is np.float64)
+        assert_equal(a.shape, res_v.shape)
+        assert_equal((0, 1), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(a, np.ndarray))
+
+        a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+        res, res_v = linalg.eig(a)
+        assert_(res_v.dtype.type is np.complex64)
+        assert_(res.dtype.type is np.complex64)
+        assert_equal(a.shape, res_v.shape)
+        assert_equal((0,), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(a, np.ndarray))
+
+
+class SVDBaseTests:
+    hermitian = False
+
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        res = linalg.svd(x)
+        U, S, Vh = res.U, res.S, res.Vh
+        assert_equal(U.dtype, dtype)
+        assert_equal(S.dtype, get_real_dtype(dtype))
+        assert_equal(Vh.dtype, dtype)
+        s = linalg.svd(x, compute_uv=False, hermitian=self.hermitian)
+        assert_equal(s.dtype, get_real_dtype(dtype))
+
+
+class SVDCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+    def do(self, a, b, tags):
+        u, s, vt = linalg.svd(a, False)
+        assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :],
+                                           np.asarray(vt)),
+                        rtol=get_rtol(u.dtype))
+        assert_(consistent_subclass(u, a))
+        assert_(consistent_subclass(vt, a))
+
+
+class TestSVD(SVDCases, SVDBaseTests):
+    def test_empty_identity(self):
+        """ Empty input should put an identity matrix in u or vh """
+        x = np.empty((4, 0))
+        u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian)
+        assert_equal(u.shape, (4, 4))
+        assert_equal(vh.shape, (0, 0))
+        assert_equal(u, np.eye(4))
+
+        x = np.empty((0, 4))
+        u, s, vh = linalg.svd(x, compute_uv=True, hermitian=self.hermitian)
+        assert_equal(u.shape, (0, 0))
+        assert_equal(vh.shape, (4, 4))
+        assert_equal(vh, np.eye(4))
+
+
+class SVDHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        u, s, vt = linalg.svd(a, False, hermitian=True)
+        assert_allclose(a, dot_generalized(np.asarray(u) * np.asarray(s)[..., None, :],
+                                           np.asarray(vt)),
+                        rtol=get_rtol(u.dtype))
+        def hermitian(mat):
+            axes = list(range(mat.ndim))
+            axes[-1], axes[-2] = axes[-2], axes[-1]
+            return np.conj(np.transpose(mat, axes=axes))
+
+        assert_almost_equal(np.matmul(u, hermitian(u)), np.broadcast_to(np.eye(u.shape[-1]), u.shape))
+        assert_almost_equal(np.matmul(vt, hermitian(vt)), np.broadcast_to(np.eye(vt.shape[-1]), vt.shape))
+        assert_equal(np.sort(s)[..., ::-1], s)
+        assert_(consistent_subclass(u, a))
+        assert_(consistent_subclass(vt, a))
+
+
+class TestSVDHermitian(SVDHermitianCases, SVDBaseTests):
+    hermitian = True
+
+
+class CondCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+    # cond(x, p) for p in (None, 2, -2)
+
+    def do(self, a, b, tags):
+        c = asarray(a)  # a might be a matrix
+        if 'size-0' in tags:
+            assert_raises(LinAlgError, linalg.cond, c)
+            return
+
+        # +-2 norms
+        s = linalg.svd(c, compute_uv=False)
+        assert_almost_equal(
+            linalg.cond(a), s[..., 0] / s[..., -1],
+            single_decimal=5, double_decimal=11)
+        assert_almost_equal(
+            linalg.cond(a, 2), s[..., 0] / s[..., -1],
+            single_decimal=5, double_decimal=11)
+        assert_almost_equal(
+            linalg.cond(a, -2), s[..., -1] / s[..., 0],
+            single_decimal=5, double_decimal=11)
+
+        # Other norms
+        cinv = np.linalg.inv(c)
+        assert_almost_equal(
+            linalg.cond(a, 1),
+            abs(c).sum(-2).max(-1) * abs(cinv).sum(-2).max(-1),
+            single_decimal=5, double_decimal=11)
+        assert_almost_equal(
+            linalg.cond(a, -1),
+            abs(c).sum(-2).min(-1) * abs(cinv).sum(-2).min(-1),
+            single_decimal=5, double_decimal=11)
+        assert_almost_equal(
+            linalg.cond(a, np.inf),
+            abs(c).sum(-1).max(-1) * abs(cinv).sum(-1).max(-1),
+            single_decimal=5, double_decimal=11)
+        assert_almost_equal(
+            linalg.cond(a, -np.inf),
+            abs(c).sum(-1).min(-1) * abs(cinv).sum(-1).min(-1),
+            single_decimal=5, double_decimal=11)
+        assert_almost_equal(
+            linalg.cond(a, 'fro'),
+            np.sqrt((abs(c)**2).sum(-1).sum(-1)
+                    * (abs(cinv)**2).sum(-1).sum(-1)),
+            single_decimal=5, double_decimal=11)
+
+
+class TestCond(CondCases):
+    def test_basic_nonsvd(self):
+        # Smoketest the non-svd norms
+        A = array([[1., 0, 1], [0, -2., 0], [0, 0, 3.]])
+        assert_almost_equal(linalg.cond(A, inf), 4)
+        assert_almost_equal(linalg.cond(A, -inf), 2/3)
+        assert_almost_equal(linalg.cond(A, 1), 4)
+        assert_almost_equal(linalg.cond(A, -1), 0.5)
+        assert_almost_equal(linalg.cond(A, 'fro'), np.sqrt(265 / 12))
+
+    def test_singular(self):
+        # Singular matrices have infinite condition number for
+        # positive norms, and negative norms shouldn't raise
+        # exceptions
+        As = [np.zeros((2, 2)), np.ones((2, 2))]
+        p_pos = [None, 1, 2, 'fro']
+        p_neg = [-1, -2]
+        for A, p in itertools.product(As, p_pos):
+            # Inversion may not hit exact infinity, so just check the
+            # number is large
+            assert_(linalg.cond(A, p) > 1e15)
+        for A, p in itertools.product(As, p_neg):
+            linalg.cond(A, p)
+
+    @pytest.mark.xfail(True, run=False,
+                       reason="Platform/LAPACK-dependent failure, "
+                              "see gh-18914")
+    def test_nan(self):
+        # nans should be passed through, not converted to infs
+        ps = [None, 1, -1, 2, -2, 'fro']
+        p_pos = [None, 1, 2, 'fro']
+
+        A = np.ones((2, 2))
+        A[0,1] = np.nan
+        for p in ps:
+            c = linalg.cond(A, p)
+            assert_(isinstance(c, np.float_))
+            assert_(np.isnan(c))
+
+        A = np.ones((3, 2, 2))
+        A[1,0,1] = np.nan
+        for p in ps:
+            c = linalg.cond(A, p)
+            assert_(np.isnan(c[1]))
+            if p in p_pos:
+                assert_(c[0] > 1e15)
+                assert_(c[2] > 1e15)
+            else:
+                assert_(not np.isnan(c[0]))
+                assert_(not np.isnan(c[2]))
+
+    def test_stacked_singular(self):
+        # Check behavior when only some of the stacked matrices are
+        # singular
+        np.random.seed(1234)
+        A = np.random.rand(2, 2, 2, 2)
+        A[0,0] = 0
+        A[1,1] = 0
+
+        for p in (None, 1, 2, 'fro', -1, -2):
+            c = linalg.cond(A, p)
+            assert_equal(c[0,0], np.inf)
+            assert_equal(c[1,1], np.inf)
+            assert_(np.isfinite(c[0,1]))
+            assert_(np.isfinite(c[1,0]))
+
+
+class PinvCases(LinalgSquareTestCase,
+                LinalgNonsquareTestCase,
+                LinalgGeneralizedSquareTestCase,
+                LinalgGeneralizedNonsquareTestCase):
+
+    def do(self, a, b, tags):
+        a_ginv = linalg.pinv(a)
+        # `a @ a_ginv == I` does not hold if a is singular
+        dot = dot_generalized
+        assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11)
+        assert_(consistent_subclass(a_ginv, a))
+
+
+class TestPinv(PinvCases):
+    pass
+
+
+class PinvHermitianCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        a_ginv = linalg.pinv(a, hermitian=True)
+        # `a @ a_ginv == I` does not hold if a is singular
+        dot = dot_generalized
+        assert_almost_equal(dot(dot(a, a_ginv), a), a, single_decimal=5, double_decimal=11)
+        assert_(consistent_subclass(a_ginv, a))
+
+
+class TestPinvHermitian(PinvHermitianCases):
+    pass
+
+
+class DetCases(LinalgSquareTestCase, LinalgGeneralizedSquareTestCase):
+
+    def do(self, a, b, tags):
+        d = linalg.det(a)
+        res = linalg.slogdet(a)
+        s, ld = res.sign, res.logabsdet
+        if asarray(a).dtype.type in (single, double):
+            ad = asarray(a).astype(double)
+        else:
+            ad = asarray(a).astype(cdouble)
+        ev = linalg.eigvals(ad)
+        assert_almost_equal(d, multiply.reduce(ev, axis=-1))
+        assert_almost_equal(s * np.exp(ld), multiply.reduce(ev, axis=-1))
+
+        s = np.atleast_1d(s)
+        ld = np.atleast_1d(ld)
+        m = (s != 0)
+        assert_almost_equal(np.abs(s[m]), 1)
+        assert_equal(ld[~m], -inf)
+
+
+class TestDet(DetCases):
+    def test_zero(self):
+        assert_equal(linalg.det([[0.0]]), 0.0)
+        assert_equal(type(linalg.det([[0.0]])), double)
+        assert_equal(linalg.det([[0.0j]]), 0.0)
+        assert_equal(type(linalg.det([[0.0j]])), cdouble)
+
+        assert_equal(linalg.slogdet([[0.0]]), (0.0, -inf))
+        assert_equal(type(linalg.slogdet([[0.0]])[0]), double)
+        assert_equal(type(linalg.slogdet([[0.0]])[1]), double)
+        assert_equal(linalg.slogdet([[0.0j]]), (0.0j, -inf))
+        assert_equal(type(linalg.slogdet([[0.0j]])[0]), cdouble)
+        assert_equal(type(linalg.slogdet([[0.0j]])[1]), double)
+
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        assert_equal(np.linalg.det(x).dtype, dtype)
+        ph, s = np.linalg.slogdet(x)
+        assert_equal(s.dtype, get_real_dtype(dtype))
+        assert_equal(ph.dtype, dtype)
+
+    def test_0_size(self):
+        a = np.zeros((0, 0), dtype=np.complex64)
+        res = linalg.det(a)
+        assert_equal(res, 1.)
+        assert_(res.dtype.type is np.complex64)
+        res = linalg.slogdet(a)
+        assert_equal(res, (1, 0))
+        assert_(res[0].dtype.type is np.complex64)
+        assert_(res[1].dtype.type is np.float32)
+
+        a = np.zeros((0, 0), dtype=np.float64)
+        res = linalg.det(a)
+        assert_equal(res, 1.)
+        assert_(res.dtype.type is np.float64)
+        res = linalg.slogdet(a)
+        assert_equal(res, (1, 0))
+        assert_(res[0].dtype.type is np.float64)
+        assert_(res[1].dtype.type is np.float64)
+
+
+class LstsqCases(LinalgSquareTestCase, LinalgNonsquareTestCase):
+
+    def do(self, a, b, tags):
+        arr = np.asarray(a)
+        m, n = arr.shape
+        u, s, vt = linalg.svd(a, False)
+        x, residuals, rank, sv = linalg.lstsq(a, b, rcond=-1)
+        if m == 0:
+            assert_((x == 0).all())
+        if m <= n:
+            assert_almost_equal(b, dot(a, x))
+            assert_equal(rank, m)
+        else:
+            assert_equal(rank, n)
+        assert_almost_equal(sv, sv.__array_wrap__(s))
+        if rank == n and m > n:
+            expect_resids = (
+                np.asarray(abs(np.dot(a, x) - b)) ** 2).sum(axis=0)
+            expect_resids = np.asarray(expect_resids)
+            if np.asarray(b).ndim == 1:
+                expect_resids.shape = (1,)
+                assert_equal(residuals.shape, expect_resids.shape)
+        else:
+            expect_resids = np.array([]).view(type(x))
+        assert_almost_equal(residuals, expect_resids)
+        assert_(np.issubdtype(residuals.dtype, np.floating))
+        assert_(consistent_subclass(x, b))
+        assert_(consistent_subclass(residuals, b))
+
+
+class TestLstsq(LstsqCases):
+    def test_future_rcond(self):
+        a = np.array([[0., 1.,  0.,  1.,  2.,  0.],
+                      [0., 2.,  0.,  0.,  1.,  0.],
+                      [1., 0.,  1.,  0.,  0.,  4.],
+                      [0., 0.,  0.,  2.,  3.,  0.]]).T
+
+        b = np.array([1, 0, 0, 0, 0, 0])
+        with suppress_warnings() as sup:
+            w = sup.record(FutureWarning, "`rcond` parameter will change")
+            x, residuals, rank, s = linalg.lstsq(a, b)
+            assert_(rank == 4)
+            x, residuals, rank, s = linalg.lstsq(a, b, rcond=-1)
+            assert_(rank == 4)
+            x, residuals, rank, s = linalg.lstsq(a, b, rcond=None)
+            assert_(rank == 3)
+            # Warning should be raised exactly once (first command)
+            assert_(len(w) == 1)
+
+    @pytest.mark.parametrize(["m", "n", "n_rhs"], [
+        (4, 2, 2),
+        (0, 4, 1),
+        (0, 4, 2),
+        (4, 0, 1),
+        (4, 0, 2),
+        (4, 2, 0),
+        (0, 0, 0)
+    ])
+    def test_empty_a_b(self, m, n, n_rhs):
+        a = np.arange(m * n).reshape(m, n)
+        b = np.ones((m, n_rhs))
+        x, residuals, rank, s = linalg.lstsq(a, b, rcond=None)
+        if m == 0:
+            assert_((x == 0).all())
+        assert_equal(x.shape, (n, n_rhs))
+        assert_equal(residuals.shape, ((n_rhs,) if m > n else (0,)))
+        if m > n and n_rhs > 0:
+            # residuals are exactly the squared norms of b's columns
+            r = b - np.dot(a, x)
+            assert_almost_equal(residuals, (r * r).sum(axis=-2))
+        assert_equal(rank, min(m, n))
+        assert_equal(s.shape, (min(m, n),))
+
+    def test_incompatible_dims(self):
+        # use modified version of docstring example
+        x = np.array([0, 1, 2, 3])
+        y = np.array([-1, 0.2, 0.9, 2.1, 3.3])
+        A = np.vstack([x, np.ones(len(x))]).T
+        with assert_raises_regex(LinAlgError, "Incompatible dimensions"):
+            linalg.lstsq(A, y, rcond=None)
+
+
+@pytest.mark.parametrize('dt', [np.dtype(c) for c in '?bBhHiIqQefdgFDGO'])
+class TestMatrixPower:
+
+    rshft_0 = np.eye(4)
+    rshft_1 = rshft_0[[3, 0, 1, 2]]
+    rshft_2 = rshft_0[[2, 3, 0, 1]]
+    rshft_3 = rshft_0[[1, 2, 3, 0]]
+    rshft_all = [rshft_0, rshft_1, rshft_2, rshft_3]
+    noninv = array([[1, 0], [0, 0]])
+    stacked = np.block([[[rshft_0]]]*2)
+    #FIXME the 'e' dtype might work in future
+    dtnoinv = [object, np.dtype('e'), np.dtype('g'), np.dtype('G')]
+
+    def test_large_power(self, dt):
+        rshft = self.rshft_1.astype(dt)
+        assert_equal(
+            matrix_power(rshft, 2**100 + 2**10 + 2**5 + 0), self.rshft_0)
+        assert_equal(
+            matrix_power(rshft, 2**100 + 2**10 + 2**5 + 1), self.rshft_1)
+        assert_equal(
+            matrix_power(rshft, 2**100 + 2**10 + 2**5 + 2), self.rshft_2)
+        assert_equal(
+            matrix_power(rshft, 2**100 + 2**10 + 2**5 + 3), self.rshft_3)
+
+    def test_power_is_zero(self, dt):
+        def tz(M):
+            mz = matrix_power(M, 0)
+            assert_equal(mz, identity_like_generalized(M))
+            assert_equal(mz.dtype, M.dtype)
+
+        for mat in self.rshft_all:
+            tz(mat.astype(dt))
+            if dt != object:
+                tz(self.stacked.astype(dt))
+
+    def test_power_is_one(self, dt):
+        def tz(mat):
+            mz = matrix_power(mat, 1)
+            assert_equal(mz, mat)
+            assert_equal(mz.dtype, mat.dtype)
+
+        for mat in self.rshft_all:
+            tz(mat.astype(dt))
+            if dt != object:
+                tz(self.stacked.astype(dt))
+
+    def test_power_is_two(self, dt):
+        def tz(mat):
+            mz = matrix_power(mat, 2)
+            mmul = matmul if mat.dtype != object else dot
+            assert_equal(mz, mmul(mat, mat))
+            assert_equal(mz.dtype, mat.dtype)
+
+        for mat in self.rshft_all:
+            tz(mat.astype(dt))
+            if dt != object:
+                tz(self.stacked.astype(dt))
+
+    def test_power_is_minus_one(self, dt):
+        def tz(mat):
+            invmat = matrix_power(mat, -1)
+            mmul = matmul if mat.dtype != object else dot
+            assert_almost_equal(
+                mmul(invmat, mat), identity_like_generalized(mat))
+
+        for mat in self.rshft_all:
+            if dt not in self.dtnoinv:
+                tz(mat.astype(dt))
+
+    def test_exceptions_bad_power(self, dt):
+        mat = self.rshft_0.astype(dt)
+        assert_raises(TypeError, matrix_power, mat, 1.5)
+        assert_raises(TypeError, matrix_power, mat, [1])
+
+    def test_exceptions_non_square(self, dt):
+        assert_raises(LinAlgError, matrix_power, np.array([1], dt), 1)
+        assert_raises(LinAlgError, matrix_power, np.array([[1], [2]], dt), 1)
+        assert_raises(LinAlgError, matrix_power, np.ones((4, 3, 2), dt), 1)
+
+    @pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+    def test_exceptions_not_invertible(self, dt):
+        if dt in self.dtnoinv:
+            return
+        mat = self.noninv.astype(dt)
+        assert_raises(LinAlgError, matrix_power, mat, -1)
+
+
+class TestEigvalshCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        # note that eigenvalue arrays returned by eig must be sorted since
+        # their order isn't guaranteed.
+        ev = linalg.eigvalsh(a, 'L')
+        evalues, evectors = linalg.eig(a)
+        evalues.sort(axis=-1)
+        assert_allclose(ev, evalues, rtol=get_rtol(ev.dtype))
+
+        ev2 = linalg.eigvalsh(a, 'U')
+        assert_allclose(ev2, evalues, rtol=get_rtol(ev.dtype))
+
+
+class TestEigvalsh:
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        w = np.linalg.eigvalsh(x)
+        assert_equal(w.dtype, get_real_dtype(dtype))
+
+    def test_invalid(self):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=np.float32)
+        assert_raises(ValueError, np.linalg.eigvalsh, x, UPLO="lrong")
+        assert_raises(ValueError, np.linalg.eigvalsh, x, "lower")
+        assert_raises(ValueError, np.linalg.eigvalsh, x, "upper")
+
+    def test_UPLO(self):
+        Klo = np.array([[0, 0], [1, 0]], dtype=np.double)
+        Kup = np.array([[0, 1], [0, 0]], dtype=np.double)
+        tgt = np.array([-1, 1], dtype=np.double)
+        rtol = get_rtol(np.double)
+
+        # Check default is 'L'
+        w = np.linalg.eigvalsh(Klo)
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'L'
+        w = np.linalg.eigvalsh(Klo, UPLO='L')
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'l'
+        w = np.linalg.eigvalsh(Klo, UPLO='l')
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'U'
+        w = np.linalg.eigvalsh(Kup, UPLO='U')
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'u'
+        w = np.linalg.eigvalsh(Kup, UPLO='u')
+        assert_allclose(w, tgt, rtol=rtol)
+
+    def test_0_size(self):
+        # Check that all kinds of 0-sized arrays work
+        class ArraySubclass(np.ndarray):
+            pass
+        a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+        res = linalg.eigvalsh(a)
+        assert_(res.dtype.type is np.float64)
+        assert_equal((0, 1), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(res, np.ndarray))
+
+        a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+        res = linalg.eigvalsh(a)
+        assert_(res.dtype.type is np.float32)
+        assert_equal((0,), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(res, np.ndarray))
+
+
+class TestEighCases(HermitianTestCase, HermitianGeneralizedTestCase):
+
+    def do(self, a, b, tags):
+        # note that eigenvalue arrays returned by eig must be sorted since
+        # their order isn't guaranteed.
+        res = linalg.eigh(a)
+        ev, evc = res.eigenvalues, res.eigenvectors
+        evalues, evectors = linalg.eig(a)
+        evalues.sort(axis=-1)
+        assert_almost_equal(ev, evalues)
+
+        assert_allclose(dot_generalized(a, evc),
+                        np.asarray(ev)[..., None, :] * np.asarray(evc),
+                        rtol=get_rtol(ev.dtype))
+
+        ev2, evc2 = linalg.eigh(a, 'U')
+        assert_almost_equal(ev2, evalues)
+
+        assert_allclose(dot_generalized(a, evc2),
+                        np.asarray(ev2)[..., None, :] * np.asarray(evc2),
+                        rtol=get_rtol(ev.dtype), err_msg=repr(a))
+
+
+class TestEigh:
+    @pytest.mark.parametrize('dtype', [single, double, csingle, cdouble])
+    def test_types(self, dtype):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=dtype)
+        w, v = np.linalg.eigh(x)
+        assert_equal(w.dtype, get_real_dtype(dtype))
+        assert_equal(v.dtype, dtype)
+
+    def test_invalid(self):
+        x = np.array([[1, 0.5], [0.5, 1]], dtype=np.float32)
+        assert_raises(ValueError, np.linalg.eigh, x, UPLO="lrong")
+        assert_raises(ValueError, np.linalg.eigh, x, "lower")
+        assert_raises(ValueError, np.linalg.eigh, x, "upper")
+
+    def test_UPLO(self):
+        Klo = np.array([[0, 0], [1, 0]], dtype=np.double)
+        Kup = np.array([[0, 1], [0, 0]], dtype=np.double)
+        tgt = np.array([-1, 1], dtype=np.double)
+        rtol = get_rtol(np.double)
+
+        # Check default is 'L'
+        w, v = np.linalg.eigh(Klo)
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'L'
+        w, v = np.linalg.eigh(Klo, UPLO='L')
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'l'
+        w, v = np.linalg.eigh(Klo, UPLO='l')
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'U'
+        w, v = np.linalg.eigh(Kup, UPLO='U')
+        assert_allclose(w, tgt, rtol=rtol)
+        # Check 'u'
+        w, v = np.linalg.eigh(Kup, UPLO='u')
+        assert_allclose(w, tgt, rtol=rtol)
+
+    def test_0_size(self):
+        # Check that all kinds of 0-sized arrays work
+        class ArraySubclass(np.ndarray):
+            pass
+        a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+        res, res_v = linalg.eigh(a)
+        assert_(res_v.dtype.type is np.float64)
+        assert_(res.dtype.type is np.float64)
+        assert_equal(a.shape, res_v.shape)
+        assert_equal((0, 1), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(a, np.ndarray))
+
+        a = np.zeros((0, 0), dtype=np.complex64).view(ArraySubclass)
+        res, res_v = linalg.eigh(a)
+        assert_(res_v.dtype.type is np.complex64)
+        assert_(res.dtype.type is np.float32)
+        assert_equal(a.shape, res_v.shape)
+        assert_equal((0,), res.shape)
+        # This is just for documentation, it might make sense to change:
+        assert_(isinstance(a, np.ndarray))
+
+
+class _TestNormBase:
+    dt = None
+    dec = None
+
+    @staticmethod
+    def check_dtype(x, res):
+        if issubclass(x.dtype.type, np.inexact):
+            assert_equal(res.dtype, x.real.dtype)
+        else:
+            # For integer input, don't have to test float precision of output.
+            assert_(issubclass(res.dtype.type, np.floating))
+
+
+class _TestNormGeneral(_TestNormBase):
+
+    def test_empty(self):
+        assert_equal(norm([]), 0.0)
+        assert_equal(norm(array([], dtype=self.dt)), 0.0)
+        assert_equal(norm(atleast_2d(array([], dtype=self.dt))), 0.0)
+
+    def test_vector_return_type(self):
+        a = np.array([1, 0, 1])
+
+        exact_types = np.typecodes['AllInteger']
+        inexact_types = np.typecodes['AllFloat']
+
+        all_types = exact_types + inexact_types
+
+        for each_type in all_types:
+            at = a.astype(each_type)
+
+            an = norm(at, -np.inf)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 0.0)
+
+            with suppress_warnings() as sup:
+                sup.filter(RuntimeWarning, "divide by zero encountered")
+                an = norm(at, -1)
+                self.check_dtype(at, an)
+                assert_almost_equal(an, 0.0)
+
+            an = norm(at, 0)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 2)
+
+            an = norm(at, 1)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 2.0)
+
+            an = norm(at, 2)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, an.dtype.type(2.0)**an.dtype.type(1.0/2.0))
+
+            an = norm(at, 4)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, an.dtype.type(2.0)**an.dtype.type(1.0/4.0))
+
+            an = norm(at, np.inf)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 1.0)
+
+    def test_vector(self):
+        a = [1, 2, 3, 4]
+        b = [-1, -2, -3, -4]
+        c = [-1, 2, -3, 4]
+
+        def _test(v):
+            np.testing.assert_almost_equal(norm(v), 30 ** 0.5,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, inf), 4.0,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, -inf), 1.0,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, 1), 10.0,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, -1), 12.0 / 25,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, 2), 30 ** 0.5,
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, -2), ((205. / 144) ** -0.5),
+                                           decimal=self.dec)
+            np.testing.assert_almost_equal(norm(v, 0), 4,
+                                           decimal=self.dec)
+
+        for v in (a, b, c,):
+            _test(v)
+
+        for v in (array(a, dtype=self.dt), array(b, dtype=self.dt),
+                  array(c, dtype=self.dt)):
+            _test(v)
+
+    def test_axis(self):
+        # Vector norms.
+        # Compare the use of `axis` with computing the norm of each row
+        # or column separately.
+        A = array([[1, 2, 3], [4, 5, 6]], dtype=self.dt)
+        for order in [None, -1, 0, 1, 2, 3, np.Inf, -np.Inf]:
+            expected0 = [norm(A[:, k], ord=order) for k in range(A.shape[1])]
+            assert_almost_equal(norm(A, ord=order, axis=0), expected0)
+            expected1 = [norm(A[k, :], ord=order) for k in range(A.shape[0])]
+            assert_almost_equal(norm(A, ord=order, axis=1), expected1)
+
+        # Matrix norms.
+        B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+        nd = B.ndim
+        for order in [None, -2, 2, -1, 1, np.Inf, -np.Inf, 'fro']:
+            for axis in itertools.combinations(range(-nd, nd), 2):
+                row_axis, col_axis = axis
+                if row_axis < 0:
+                    row_axis += nd
+                if col_axis < 0:
+                    col_axis += nd
+                if row_axis == col_axis:
+                    assert_raises(ValueError, norm, B, ord=order, axis=axis)
+                else:
+                    n = norm(B, ord=order, axis=axis)
+
+                    # The logic using k_index only works for nd = 3.
+                    # This has to be changed if nd is increased.
+                    k_index = nd - (row_axis + col_axis)
+                    if row_axis < col_axis:
+                        expected = [norm(B[:].take(k, axis=k_index), ord=order)
+                                    for k in range(B.shape[k_index])]
+                    else:
+                        expected = [norm(B[:].take(k, axis=k_index).T, ord=order)
+                                    for k in range(B.shape[k_index])]
+                    assert_almost_equal(n, expected)
+
+    def test_keepdims(self):
+        A = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+
+        allclose_err = 'order {0}, axis = {1}'
+        shape_err = 'Shape mismatch found {0}, expected {1}, order={2}, axis={3}'
+
+        # check the order=None, axis=None case
+        expected = norm(A, ord=None, axis=None)
+        found = norm(A, ord=None, axis=None, keepdims=True)
+        assert_allclose(np.squeeze(found), expected,
+                        err_msg=allclose_err.format(None, None))
+        expected_shape = (1, 1, 1)
+        assert_(found.shape == expected_shape,
+                shape_err.format(found.shape, expected_shape, None, None))
+
+        # Vector norms.
+        for order in [None, -1, 0, 1, 2, 3, np.Inf, -np.Inf]:
+            for k in range(A.ndim):
+                expected = norm(A, ord=order, axis=k)
+                found = norm(A, ord=order, axis=k, keepdims=True)
+                assert_allclose(np.squeeze(found), expected,
+                                err_msg=allclose_err.format(order, k))
+                expected_shape = list(A.shape)
+                expected_shape[k] = 1
+                expected_shape = tuple(expected_shape)
+                assert_(found.shape == expected_shape,
+                        shape_err.format(found.shape, expected_shape, order, k))
+
+        # Matrix norms.
+        for order in [None, -2, 2, -1, 1, np.Inf, -np.Inf, 'fro', 'nuc']:
+            for k in itertools.permutations(range(A.ndim), 2):
+                expected = norm(A, ord=order, axis=k)
+                found = norm(A, ord=order, axis=k, keepdims=True)
+                assert_allclose(np.squeeze(found), expected,
+                                err_msg=allclose_err.format(order, k))
+                expected_shape = list(A.shape)
+                expected_shape[k[0]] = 1
+                expected_shape[k[1]] = 1
+                expected_shape = tuple(expected_shape)
+                assert_(found.shape == expected_shape,
+                        shape_err.format(found.shape, expected_shape, order, k))
+
+
+class _TestNorm2D(_TestNormBase):
+    # Define the part for 2d arrays separately, so we can subclass this
+    # and run the tests using np.matrix in matrixlib.tests.test_matrix_linalg.
+    array = np.array
+
+    def test_matrix_empty(self):
+        assert_equal(norm(self.array([[]], dtype=self.dt)), 0.0)
+
+    def test_matrix_return_type(self):
+        a = self.array([[1, 0, 1], [0, 1, 1]])
+
+        exact_types = np.typecodes['AllInteger']
+
+        # float32, complex64, float64, complex128 types are the only types
+        # allowed by `linalg`, which performs the matrix operations used
+        # within `norm`.
+        inexact_types = 'fdFD'
+
+        all_types = exact_types + inexact_types
+
+        for each_type in all_types:
+            at = a.astype(each_type)
+
+            an = norm(at, -np.inf)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 2.0)
+
+            with suppress_warnings() as sup:
+                sup.filter(RuntimeWarning, "divide by zero encountered")
+                an = norm(at, -1)
+                self.check_dtype(at, an)
+                assert_almost_equal(an, 1.0)
+
+            an = norm(at, 1)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 2.0)
+
+            an = norm(at, 2)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 3.0**(1.0/2.0))
+
+            an = norm(at, -2)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 1.0)
+
+            an = norm(at, np.inf)
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 2.0)
+
+            an = norm(at, 'fro')
+            self.check_dtype(at, an)
+            assert_almost_equal(an, 2.0)
+
+            an = norm(at, 'nuc')
+            self.check_dtype(at, an)
+            # Lower bar needed to support low precision floats.
+            # They end up being off by 1 in the 7th place.
+            np.testing.assert_almost_equal(an, 2.7320508075688772, decimal=6)
+
+    def test_matrix_2x2(self):
+        A = self.array([[1, 3], [5, 7]], dtype=self.dt)
+        assert_almost_equal(norm(A), 84 ** 0.5)
+        assert_almost_equal(norm(A, 'fro'), 84 ** 0.5)
+        assert_almost_equal(norm(A, 'nuc'), 10.0)
+        assert_almost_equal(norm(A, inf), 12.0)
+        assert_almost_equal(norm(A, -inf), 4.0)
+        assert_almost_equal(norm(A, 1), 10.0)
+        assert_almost_equal(norm(A, -1), 6.0)
+        assert_almost_equal(norm(A, 2), 9.1231056256176615)
+        assert_almost_equal(norm(A, -2), 0.87689437438234041)
+
+        assert_raises(ValueError, norm, A, 'nofro')
+        assert_raises(ValueError, norm, A, -3)
+        assert_raises(ValueError, norm, A, 0)
+
+    def test_matrix_3x3(self):
+        # This test has been added because the 2x2 example
+        # happened to have equal nuclear norm and induced 1-norm.
+        # The 1/10 scaling factor accommodates the absolute tolerance
+        # used in assert_almost_equal.
+        A = (1 / 10) * \
+            self.array([[1, 2, 3], [6, 0, 5], [3, 2, 1]], dtype=self.dt)
+        assert_almost_equal(norm(A), (1 / 10) * 89 ** 0.5)
+        assert_almost_equal(norm(A, 'fro'), (1 / 10) * 89 ** 0.5)
+        assert_almost_equal(norm(A, 'nuc'), 1.3366836911774836)
+        assert_almost_equal(norm(A, inf), 1.1)
+        assert_almost_equal(norm(A, -inf), 0.6)
+        assert_almost_equal(norm(A, 1), 1.0)
+        assert_almost_equal(norm(A, -1), 0.4)
+        assert_almost_equal(norm(A, 2), 0.88722940323461277)
+        assert_almost_equal(norm(A, -2), 0.19456584790481812)
+
+    def test_bad_args(self):
+        # Check that bad arguments raise the appropriate exceptions.
+
+        A = self.array([[1, 2, 3], [4, 5, 6]], dtype=self.dt)
+        B = np.arange(1, 25, dtype=self.dt).reshape(2, 3, 4)
+
+        # Using `axis=<integer>` or passing in a 1-D array implies vector
+        # norms are being computed, so also using `ord='fro'`
+        # or `ord='nuc'` or any other string raises a ValueError.
+        assert_raises(ValueError, norm, A, 'fro', 0)
+        assert_raises(ValueError, norm, A, 'nuc', 0)
+        assert_raises(ValueError, norm, [3, 4], 'fro', None)
+        assert_raises(ValueError, norm, [3, 4], 'nuc', None)
+        assert_raises(ValueError, norm, [3, 4], 'test', None)
+
+        # Similarly, norm should raise an exception when ord is any finite
+        # number other than 1, 2, -1 or -2 when computing matrix norms.
+        for order in [0, 3]:
+            assert_raises(ValueError, norm, A, order, None)
+            assert_raises(ValueError, norm, A, order, (0, 1))
+            assert_raises(ValueError, norm, B, order, (1, 2))
+
+        # Invalid axis
+        assert_raises(np.AxisError, norm, B, None, 3)
+        assert_raises(np.AxisError, norm, B, None, (2, 3))
+        assert_raises(ValueError, norm, B, None, (0, 1, 2))
+
+
+class _TestNorm(_TestNorm2D, _TestNormGeneral):
+    pass
+
+
+class TestNorm_NonSystematic:
+
+    def test_longdouble_norm(self):
+        # Non-regression test: p-norm of longdouble would previously raise
+        # UnboundLocalError.
+        x = np.arange(10, dtype=np.longdouble)
+        old_assert_almost_equal(norm(x, ord=3), 12.65, decimal=2)
+
+    def test_intmin(self):
+        # Non-regression test: p-norm of signed integer would previously do
+        # float cast and abs in the wrong order.
+        x = np.array([-2 ** 31], dtype=np.int32)
+        old_assert_almost_equal(norm(x, ord=3), 2 ** 31, decimal=5)
+
+    def test_complex_high_ord(self):
+        # gh-4156
+        d = np.empty((2,), dtype=np.clongdouble)
+        d[0] = 6 + 7j
+        d[1] = -6 + 7j
+        res = 11.615898132184
+        old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=10)
+        d = d.astype(np.complex128)
+        old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=9)
+        d = d.astype(np.complex64)
+        old_assert_almost_equal(np.linalg.norm(d, ord=3), res, decimal=5)
+
+
+# Separate definitions so we can use them for matrix tests.
+class _TestNormDoubleBase(_TestNormBase):
+    dt = np.double
+    dec = 12
+
+
+class _TestNormSingleBase(_TestNormBase):
+    dt = np.float32
+    dec = 6
+
+
+class _TestNormInt64Base(_TestNormBase):
+    dt = np.int64
+    dec = 12
+
+
+class TestNormDouble(_TestNorm, _TestNormDoubleBase):
+    pass
+
+
+class TestNormSingle(_TestNorm, _TestNormSingleBase):
+    pass
+
+
+class TestNormInt64(_TestNorm, _TestNormInt64Base):
+    pass
+
+
+class TestMatrixRank:
+
+    def test_matrix_rank(self):
+        # Full rank matrix
+        assert_equal(4, matrix_rank(np.eye(4)))
+        # rank deficient matrix
+        I = np.eye(4)
+        I[-1, -1] = 0.
+        assert_equal(matrix_rank(I), 3)
+        # All zeros - zero rank
+        assert_equal(matrix_rank(np.zeros((4, 4))), 0)
+        # 1 dimension - rank 1 unless all 0
+        assert_equal(matrix_rank([1, 0, 0, 0]), 1)
+        assert_equal(matrix_rank(np.zeros((4,))), 0)
+        # accepts array-like
+        assert_equal(matrix_rank([1]), 1)
+        # greater than 2 dimensions treated as stacked matrices
+        ms = np.array([I, np.eye(4), np.zeros((4,4))])
+        assert_equal(matrix_rank(ms), np.array([3, 4, 0]))
+        # works on scalar
+        assert_equal(matrix_rank(1), 1)
+
+    def test_symmetric_rank(self):
+        assert_equal(4, matrix_rank(np.eye(4), hermitian=True))
+        assert_equal(1, matrix_rank(np.ones((4, 4)), hermitian=True))
+        assert_equal(0, matrix_rank(np.zeros((4, 4)), hermitian=True))
+        # rank deficient matrix
+        I = np.eye(4)
+        I[-1, -1] = 0.
+        assert_equal(3, matrix_rank(I, hermitian=True))
+        # manually supplied tolerance
+        I[-1, -1] = 1e-8
+        assert_equal(4, matrix_rank(I, hermitian=True, tol=0.99e-8))
+        assert_equal(3, matrix_rank(I, hermitian=True, tol=1.01e-8))
+
+
+def test_reduced_rank():
+    # Test matrices with reduced rank
+    rng = np.random.RandomState(20120714)
+    for i in range(100):
+        # Make a rank deficient matrix
+        X = rng.normal(size=(40, 10))
+        X[:, 0] = X[:, 1] + X[:, 2]
+        # Assert that matrix_rank detected deficiency
+        assert_equal(matrix_rank(X), 9)
+        X[:, 3] = X[:, 4] + X[:, 5]
+        assert_equal(matrix_rank(X), 8)
+
+
+class TestQR:
+    # Define the array class here, so run this on matrices elsewhere.
+    array = np.array
+
+    def check_qr(self, a):
+        # This test expects the argument `a` to be an ndarray or
+        # a subclass of an ndarray of inexact type.
+        a_type = type(a)
+        a_dtype = a.dtype
+        m, n = a.shape
+        k = min(m, n)
+
+        # mode == 'complete'
+        res = linalg.qr(a, mode='complete')
+        Q, R = res.Q, res.R
+        assert_(Q.dtype == a_dtype)
+        assert_(R.dtype == a_dtype)
+        assert_(isinstance(Q, a_type))
+        assert_(isinstance(R, a_type))
+        assert_(Q.shape == (m, m))
+        assert_(R.shape == (m, n))
+        assert_almost_equal(dot(Q, R), a)
+        assert_almost_equal(dot(Q.T.conj(), Q), np.eye(m))
+        assert_almost_equal(np.triu(R), R)
+
+        # mode == 'reduced'
+        q1, r1 = linalg.qr(a, mode='reduced')
+        assert_(q1.dtype == a_dtype)
+        assert_(r1.dtype == a_dtype)
+        assert_(isinstance(q1, a_type))
+        assert_(isinstance(r1, a_type))
+        assert_(q1.shape == (m, k))
+        assert_(r1.shape == (k, n))
+        assert_almost_equal(dot(q1, r1), a)
+        assert_almost_equal(dot(q1.T.conj(), q1), np.eye(k))
+        assert_almost_equal(np.triu(r1), r1)
+
+        # mode == 'r'
+        r2 = linalg.qr(a, mode='r')
+        assert_(r2.dtype == a_dtype)
+        assert_(isinstance(r2, a_type))
+        assert_almost_equal(r2, r1)
+
+
+    @pytest.mark.parametrize(["m", "n"], [
+        (3, 0),
+        (0, 3),
+        (0, 0)
+    ])
+    def test_qr_empty(self, m, n):
+        k = min(m, n)
+        a = np.empty((m, n))
+
+        self.check_qr(a)
+
+        h, tau = np.linalg.qr(a, mode='raw')
+        assert_equal(h.dtype, np.double)
+        assert_equal(tau.dtype, np.double)
+        assert_equal(h.shape, (n, m))
+        assert_equal(tau.shape, (k,))
+
+    def test_mode_raw(self):
+        # The factorization is not unique and varies between libraries,
+        # so it is not possible to check against known values. Functional
+        # testing is a possibility, but awaits the exposure of more
+        # of the functions in lapack_lite. Consequently, this test is
+        # very limited in scope. Note that the results are in FORTRAN
+        # order, hence the h arrays are transposed.
+        a = self.array([[1, 2], [3, 4], [5, 6]], dtype=np.double)
+
+        # Test double
+        h, tau = linalg.qr(a, mode='raw')
+        assert_(h.dtype == np.double)
+        assert_(tau.dtype == np.double)
+        assert_(h.shape == (2, 3))
+        assert_(tau.shape == (2,))
+
+        h, tau = linalg.qr(a.T, mode='raw')
+        assert_(h.dtype == np.double)
+        assert_(tau.dtype == np.double)
+        assert_(h.shape == (3, 2))
+        assert_(tau.shape == (2,))
+
+    def test_mode_all_but_economic(self):
+        a = self.array([[1, 2], [3, 4]])
+        b = self.array([[1, 2], [3, 4], [5, 6]])
+        for dt in "fd":
+            m1 = a.astype(dt)
+            m2 = b.astype(dt)
+            self.check_qr(m1)
+            self.check_qr(m2)
+            self.check_qr(m2.T)
+
+        for dt in "fd":
+            m1 = 1 + 1j * a.astype(dt)
+            m2 = 1 + 1j * b.astype(dt)
+            self.check_qr(m1)
+            self.check_qr(m2)
+            self.check_qr(m2.T)
+
+    def check_qr_stacked(self, a):
+        # This test expects the argument `a` to be an ndarray or
+        # a subclass of an ndarray of inexact type.
+        a_type = type(a)
+        a_dtype = a.dtype
+        m, n = a.shape[-2:]
+        k = min(m, n)
+
+        # mode == 'complete'
+        q, r = linalg.qr(a, mode='complete')
+        assert_(q.dtype == a_dtype)
+        assert_(r.dtype == a_dtype)
+        assert_(isinstance(q, a_type))
+        assert_(isinstance(r, a_type))
+        assert_(q.shape[-2:] == (m, m))
+        assert_(r.shape[-2:] == (m, n))
+        assert_almost_equal(matmul(q, r), a)
+        I_mat = np.identity(q.shape[-1])
+        stack_I_mat = np.broadcast_to(I_mat,
+                        q.shape[:-2] + (q.shape[-1],)*2)
+        assert_almost_equal(matmul(swapaxes(q, -1, -2).conj(), q), stack_I_mat)
+        assert_almost_equal(np.triu(r[..., :, :]), r)
+
+        # mode == 'reduced'
+        q1, r1 = linalg.qr(a, mode='reduced')
+        assert_(q1.dtype == a_dtype)
+        assert_(r1.dtype == a_dtype)
+        assert_(isinstance(q1, a_type))
+        assert_(isinstance(r1, a_type))
+        assert_(q1.shape[-2:] == (m, k))
+        assert_(r1.shape[-2:] == (k, n))
+        assert_almost_equal(matmul(q1, r1), a)
+        I_mat = np.identity(q1.shape[-1])
+        stack_I_mat = np.broadcast_to(I_mat,
+                        q1.shape[:-2] + (q1.shape[-1],)*2)
+        assert_almost_equal(matmul(swapaxes(q1, -1, -2).conj(), q1),
+                            stack_I_mat)
+        assert_almost_equal(np.triu(r1[..., :, :]), r1)
+
+        # mode == 'r'
+        r2 = linalg.qr(a, mode='r')
+        assert_(r2.dtype == a_dtype)
+        assert_(isinstance(r2, a_type))
+        assert_almost_equal(r2, r1)
+
+    @pytest.mark.parametrize("size", [
+        (3, 4), (4, 3), (4, 4),
+        (3, 0), (0, 3)])
+    @pytest.mark.parametrize("outer_size", [
+        (2, 2), (2,), (2, 3, 4)])
+    @pytest.mark.parametrize("dt", [
+        np.single, np.double,
+        np.csingle, np.cdouble])
+    def test_stacked_inputs(self, outer_size, size, dt):
+
+        A = np.random.normal(size=outer_size + size).astype(dt)
+        B = np.random.normal(size=outer_size + size).astype(dt)
+        self.check_qr_stacked(A)
+        self.check_qr_stacked(A + 1.j*B)
+
+
+class TestCholesky:
+    # TODO: are there no other tests for cholesky?
+
+    @pytest.mark.parametrize(
+        'shape', [(1, 1), (2, 2), (3, 3), (50, 50), (3, 10, 10)]
+    )
+    @pytest.mark.parametrize(
+        'dtype', (np.float32, np.float64, np.complex64, np.complex128)
+    )
+    def test_basic_property(self, shape, dtype):
+        # Check A = L L^H
+        np.random.seed(1)
+        a = np.random.randn(*shape)
+        if np.issubdtype(dtype, np.complexfloating):
+            a = a + 1j*np.random.randn(*shape)
+
+        t = list(range(len(shape)))
+        t[-2:] = -1, -2
+
+        a = np.matmul(a.transpose(t).conj(), a)
+        a = np.asarray(a, dtype=dtype)
+
+        c = np.linalg.cholesky(a)
+
+        b = np.matmul(c, c.transpose(t).conj())
+        with np._no_nep50_warning():
+            atol = 500 * a.shape[0] * np.finfo(dtype).eps
+        assert_allclose(b, a, atol=atol, err_msg=f'{shape} {dtype}\n{a}\n{c}')
+
+    def test_0_size(self):
+        class ArraySubclass(np.ndarray):
+            pass
+        a = np.zeros((0, 1, 1), dtype=np.int_).view(ArraySubclass)
+        res = linalg.cholesky(a)
+        assert_equal(a.shape, res.shape)
+        assert_(res.dtype.type is np.float64)
+        # for documentation purpose:
+        assert_(isinstance(res, np.ndarray))
+
+        a = np.zeros((1, 0, 0), dtype=np.complex64).view(ArraySubclass)
+        res = linalg.cholesky(a)
+        assert_equal(a.shape, res.shape)
+        assert_(res.dtype.type is np.complex64)
+        assert_(isinstance(res, np.ndarray))
+
+
+def test_byteorder_check():
+    # Byte order check should pass for native order
+    if sys.byteorder == 'little':
+        native = '<'
+    else:
+        native = '>'
+
+    for dtt in (np.float32, np.float64):
+        arr = np.eye(4, dtype=dtt)
+        n_arr = arr.newbyteorder(native)
+        sw_arr = arr.newbyteorder('S').byteswap()
+        assert_equal(arr.dtype.byteorder, '=')
+        for routine in (linalg.inv, linalg.det, linalg.pinv):
+            # Normal call
+            res = routine(arr)
+            # Native but not '='
+            assert_array_equal(res, routine(n_arr))
+            # Swapped
+            assert_array_equal(res, routine(sw_arr))
+
+
+@pytest.mark.skipif(IS_WASM, reason="fp errors don't work in wasm")
+def test_generalized_raise_multiloop():
+    # It should raise an error even if the error doesn't occur in the
+    # last iteration of the ufunc inner loop
+
+    invertible = np.array([[1, 2], [3, 4]])
+    non_invertible = np.array([[1, 1], [1, 1]])
+
+    x = np.zeros([4, 4, 2, 2])[1::2]
+    x[...] = invertible
+    x[0, 0] = non_invertible
+
+    assert_raises(np.linalg.LinAlgError, np.linalg.inv, x)
+
+
+def test_xerbla_override():
+    # Check that our xerbla has been successfully linked in. If it is not,
+    # the default xerbla routine is called, which prints a message to stdout
+    # and may, or may not, abort the process depending on the LAPACK package.
+
+    XERBLA_OK = 255
+
+    try:
+        pid = os.fork()
+    except (OSError, AttributeError):
+        # fork failed, or not running on POSIX
+        pytest.skip("Not POSIX or fork failed.")
+
+    if pid == 0:
+        # child; close i/o file handles
+        os.close(1)
+        os.close(0)
+        # Avoid producing core files.
+        import resource
+        resource.setrlimit(resource.RLIMIT_CORE, (0, 0))
+        # These calls may abort.
+        try:
+            np.linalg.lapack_lite.xerbla()
+        except ValueError:
+            pass
+        except Exception:
+            os._exit(os.EX_CONFIG)
+
+        try:
+            a = np.array([[1.]])
+            np.linalg.lapack_lite.dorgqr(
+                1, 1, 1, a,
+                0,  # <- invalid value
+                a, a, 0, 0)
+        except ValueError as e:
+            if "DORGQR parameter number 5" in str(e):
+                # success, reuse error code to mark success as
+                # FORTRAN STOP returns as success.
+                os._exit(XERBLA_OK)
+
+        # Did not abort, but our xerbla was not linked in.
+        os._exit(os.EX_CONFIG)
+    else:
+        # parent
+        pid, status = os.wait()
+        if os.WEXITSTATUS(status) != XERBLA_OK:
+            pytest.skip('Numpy xerbla not linked in.')
+
+
+@pytest.mark.skipif(IS_WASM, reason="Cannot start subprocess")
+@pytest.mark.slow
+def test_sdot_bug_8577():
+    # Regression test that loading certain other libraries does not
+    # result to wrong results in float32 linear algebra.
+    #
+    # There's a bug gh-8577 on OSX that can trigger this, and perhaps
+    # there are also other situations in which it occurs.
+    #
+    # Do the check in a separate process.
+
+    bad_libs = ['PyQt5.QtWidgets', 'IPython']
+
+    template = textwrap.dedent("""
+    import sys
+    {before}
+    try:
+        import {bad_lib}
+    except ImportError:
+        sys.exit(0)
+    {after}
+    x = np.ones(2, dtype=np.float32)
+    sys.exit(0 if np.allclose(x.dot(x), 2.0) else 1)
+    """)
+
+    for bad_lib in bad_libs:
+        code = template.format(before="import numpy as np", after="",
+                               bad_lib=bad_lib)
+        subprocess.check_call([sys.executable, "-c", code])
+
+        # Swapped import order
+        code = template.format(after="import numpy as np", before="",
+                               bad_lib=bad_lib)
+        subprocess.check_call([sys.executable, "-c", code])
+
+
+class TestMultiDot:
+
+    def test_basic_function_with_three_arguments(self):
+        # multi_dot with three arguments uses a fast hand coded algorithm to
+        # determine the optimal order. Therefore test it separately.
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+
+        assert_almost_equal(multi_dot([A, B, C]), A.dot(B).dot(C))
+        assert_almost_equal(multi_dot([A, B, C]), np.dot(A, np.dot(B, C)))
+
+    def test_basic_function_with_two_arguments(self):
+        # separate code path with two arguments
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+
+        assert_almost_equal(multi_dot([A, B]), A.dot(B))
+        assert_almost_equal(multi_dot([A, B]), np.dot(A, B))
+
+    def test_basic_function_with_dynamic_programming_optimization(self):
+        # multi_dot with four or more arguments uses the dynamic programming
+        # optimization and therefore deserve a separate
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+        D = np.random.random((2, 1))
+        assert_almost_equal(multi_dot([A, B, C, D]), A.dot(B).dot(C).dot(D))
+
+    def test_vector_as_first_argument(self):
+        # The first argument can be 1-D
+        A1d = np.random.random(2)  # 1-D
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+        D = np.random.random((2, 2))
+
+        # the result should be 1-D
+        assert_equal(multi_dot([A1d, B, C, D]).shape, (2,))
+
+    def test_vector_as_last_argument(self):
+        # The last argument can be 1-D
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+        D1d = np.random.random(2)  # 1-D
+
+        # the result should be 1-D
+        assert_equal(multi_dot([A, B, C, D1d]).shape, (6,))
+
+    def test_vector_as_first_and_last_argument(self):
+        # The first and last arguments can be 1-D
+        A1d = np.random.random(2)  # 1-D
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+        D1d = np.random.random(2)  # 1-D
+
+        # the result should be a scalar
+        assert_equal(multi_dot([A1d, B, C, D1d]).shape, ())
+
+    def test_three_arguments_and_out(self):
+        # multi_dot with three arguments uses a fast hand coded algorithm to
+        # determine the optimal order. Therefore test it separately.
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+
+        out = np.zeros((6, 2))
+        ret = multi_dot([A, B, C], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B).dot(C))
+        assert_almost_equal(out, np.dot(A, np.dot(B, C)))
+
+    def test_two_arguments_and_out(self):
+        # separate code path with two arguments
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        out = np.zeros((6, 6))
+        ret = multi_dot([A, B], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B))
+        assert_almost_equal(out, np.dot(A, B))
+
+    def test_dynamic_programming_optimization_and_out(self):
+        # multi_dot with four or more arguments uses the dynamic programming
+        # optimization and therefore deserve a separate test
+        A = np.random.random((6, 2))
+        B = np.random.random((2, 6))
+        C = np.random.random((6, 2))
+        D = np.random.random((2, 1))
+        out = np.zeros((6, 1))
+        ret = multi_dot([A, B, C, D], out=out)
+        assert out is ret
+        assert_almost_equal(out, A.dot(B).dot(C).dot(D))
+
+    def test_dynamic_programming_logic(self):
+        # Test for the dynamic programming part
+        # This test is directly taken from Cormen page 376.
+        arrays = [np.random.random((30, 35)),
+                  np.random.random((35, 15)),
+                  np.random.random((15, 5)),
+                  np.random.random((5, 10)),
+                  np.random.random((10, 20)),
+                  np.random.random((20, 25))]
+        m_expected = np.array([[0., 15750., 7875., 9375., 11875., 15125.],
+                               [0.,     0., 2625., 4375.,  7125., 10500.],
+                               [0.,     0.,    0.,  750.,  2500.,  5375.],
+                               [0.,     0.,    0.,    0.,  1000.,  3500.],
+                               [0.,     0.,    0.,    0.,     0.,  5000.],
+                               [0.,     0.,    0.,    0.,     0.,     0.]])
+        s_expected = np.array([[0,  1,  1,  3,  3,  3],
+                               [0,  0,  2,  3,  3,  3],
+                               [0,  0,  0,  3,  3,  3],
+                               [0,  0,  0,  0,  4,  5],
+                               [0,  0,  0,  0,  0,  5],
+                               [0,  0,  0,  0,  0,  0]], dtype=int)
+        s_expected -= 1  # Cormen uses 1-based index, python does not.
+
+        s, m = _multi_dot_matrix_chain_order(arrays, return_costs=True)
+
+        # Only the upper triangular part (without the diagonal) is interesting.
+        assert_almost_equal(np.triu(s[:-1, 1:]),
+                            np.triu(s_expected[:-1, 1:]))
+        assert_almost_equal(np.triu(m), np.triu(m_expected))
+
+    def test_too_few_input_arrays(self):
+        assert_raises(ValueError, multi_dot, [])
+        assert_raises(ValueError, multi_dot, [np.random.random((3, 3))])
+
+
+class TestTensorinv:
+
+    @pytest.mark.parametrize("arr, ind", [
+        (np.ones((4, 6, 8, 2)), 2),
+        (np.ones((3, 3, 2)), 1),
+        ])
+    def test_non_square_handling(self, arr, ind):
+        with assert_raises(LinAlgError):
+            linalg.tensorinv(arr, ind=ind)
+
+    @pytest.mark.parametrize("shape, ind", [
+        # examples from docstring
+        ((4, 6, 8, 3), 2),
+        ((24, 8, 3), 1),
+        ])
+    def test_tensorinv_shape(self, shape, ind):
+        a = np.eye(24)
+        a.shape = shape
+        ainv = linalg.tensorinv(a=a, ind=ind)
+        expected = a.shape[ind:] + a.shape[:ind]
+        actual = ainv.shape
+        assert_equal(actual, expected)
+
+    @pytest.mark.parametrize("ind", [
+        0, -2,
+        ])
+    def test_tensorinv_ind_limit(self, ind):
+        a = np.eye(24)
+        a.shape = (4, 6, 8, 3)
+        with assert_raises(ValueError):
+            linalg.tensorinv(a=a, ind=ind)
+
+    def test_tensorinv_result(self):
+        # mimic a docstring example
+        a = np.eye(24)
+        a.shape = (24, 8, 3)
+        ainv = linalg.tensorinv(a, ind=1)
+        b = np.ones(24)
+        assert_allclose(np.tensordot(ainv, b, 1), np.linalg.tensorsolve(a, b))
+
+
+class TestTensorsolve:
+
+    @pytest.mark.parametrize("a, axes", [
+        (np.ones((4, 6, 8, 2)), None),
+        (np.ones((3, 3, 2)), (0, 2)),
+        ])
+    def test_non_square_handling(self, a, axes):
+        with assert_raises(LinAlgError):
+            b = np.ones(a.shape[:2])
+            linalg.tensorsolve(a, b, axes=axes)
+
+    @pytest.mark.parametrize("shape",
+        [(2, 3, 6), (3, 4, 4, 3), (0, 3, 3, 0)],
+    )
+    def test_tensorsolve_result(self, shape):
+        a = np.random.randn(*shape)
+        b = np.ones(a.shape[:2])
+        x = np.linalg.tensorsolve(a, b)
+        assert_allclose(np.tensordot(a, x, axes=len(x.shape)), b)
+
+
+def test_unsupported_commontype():
+    # linalg gracefully handles unsupported type
+    arr = np.array([[1, -2], [2, 5]], dtype='float16')
+    with assert_raises_regex(TypeError, "unsupported in linalg"):
+        linalg.cholesky(arr)
+
+
+#@pytest.mark.slow
+#@pytest.mark.xfail(not HAS_LAPACK64, run=False,
+#                   reason="Numpy not compiled with 64-bit BLAS/LAPACK")
+#@requires_memory(free_bytes=16e9)
+@pytest.mark.skip(reason="Bad memory reports lead to OOM in ci testing")
+def test_blas64_dot():
+    n = 2**32
+    a = np.zeros([1, n], dtype=np.float32)
+    b = np.ones([1, 1], dtype=np.float32)
+    a[0,-1] = 1
+    c = np.dot(b, a)
+    assert_equal(c[0,-1], 1)
+
+
+@pytest.mark.xfail(not HAS_LAPACK64,
+                   reason="Numpy not compiled with 64-bit BLAS/LAPACK")
+def test_blas64_geqrf_lwork_smoketest():
+    # Smoke test LAPACK geqrf lwork call with 64-bit integers
+    dtype = np.float64
+    lapack_routine = np.linalg.lapack_lite.dgeqrf
+
+    m = 2**32 + 1
+    n = 2**32 + 1
+    lda = m
+
+    # Dummy arrays, not referenced by the lapack routine, so don't
+    # need to be of the right size
+    a = np.zeros([1, 1], dtype=dtype)
+    work = np.zeros([1], dtype=dtype)
+    tau = np.zeros([1], dtype=dtype)
+
+    # Size query
+    results = lapack_routine(m, n, a, lda, tau, work, -1, 0)
+    assert_equal(results['info'], 0)
+    assert_equal(results['m'], m)
+    assert_equal(results['n'], m)
+
+    # Should result to an integer of a reasonable size
+    lwork = int(work.item())
+    assert_(2**32 < lwork < 2**42)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_regression.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_regression.py
new file mode 100644
index 0000000000000000000000000000000000000000..af38443a93c3b74d4344130ab7bbac0206e9f7f7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/linalg/tests/test_regression.py
@@ -0,0 +1,145 @@
+""" Test functions for linalg module
+"""
+import warnings
+
+import numpy as np
+from numpy import linalg, arange, float64, array, dot, transpose
+from numpy.testing import (
+    assert_, assert_raises, assert_equal, assert_array_equal,
+    assert_array_almost_equal, assert_array_less
+)
+
+
+class TestRegression:
+
+    def test_eig_build(self):
+        # Ticket #652
+        rva = array([1.03221168e+02 + 0.j,
+                     -1.91843603e+01 + 0.j,
+                     -6.04004526e-01 + 15.84422474j,
+                     -6.04004526e-01 - 15.84422474j,
+                     -1.13692929e+01 + 0.j,
+                     -6.57612485e-01 + 10.41755503j,
+                     -6.57612485e-01 - 10.41755503j,
+                     1.82126812e+01 + 0.j,
+                     1.06011014e+01 + 0.j,
+                     7.80732773e+00 + 0.j,
+                     -7.65390898e-01 + 0.j,
+                     1.51971555e-15 + 0.j,
+                     -1.51308713e-15 + 0.j])
+        a = arange(13 * 13, dtype=float64)
+        a.shape = (13, 13)
+        a = a % 17
+        va, ve = linalg.eig(a)
+        va.sort()
+        rva.sort()
+        assert_array_almost_equal(va, rva)
+
+    def test_eigh_build(self):
+        # Ticket 662.
+        rvals = [68.60568999, 89.57756725, 106.67185574]
+
+        cov = array([[77.70273908,   3.51489954,  15.64602427],
+                     [3.51489954,  88.97013878,  -1.07431931],
+                     [15.64602427,  -1.07431931,  98.18223512]])
+
+        vals, vecs = linalg.eigh(cov)
+        assert_array_almost_equal(vals, rvals)
+
+    def test_svd_build(self):
+        # Ticket 627.
+        a = array([[0., 1.], [1., 1.], [2., 1.], [3., 1.]])
+        m, n = a.shape
+        u, s, vh = linalg.svd(a)
+
+        b = dot(transpose(u[:, n:]), a)
+
+        assert_array_almost_equal(b, np.zeros((2, 2)))
+
+    def test_norm_vector_badarg(self):
+        # Regression for #786: Frobenius norm for vectors raises
+        # ValueError.
+        assert_raises(ValueError, linalg.norm, array([1., 2., 3.]), 'fro')
+
+    def test_lapack_endian(self):
+        # For bug #1482
+        a = array([[5.7998084,  -2.1825367],
+                   [-2.1825367,   9.85910595]], dtype='>f8')
+        b = array(a, dtype='<f8')
+
+        ap = linalg.cholesky(a)
+        bp = linalg.cholesky(b)
+        assert_array_equal(ap, bp)
+
+    def test_large_svd_32bit(self):
+        # See gh-4442, 64bit would require very large/slow matrices.
+        x = np.eye(1000, 66)
+        np.linalg.svd(x)
+
+    def test_svd_no_uv(self):
+        # gh-4733
+        for shape in (3, 4), (4, 4), (4, 3):
+            for t in float, complex:
+                a = np.ones(shape, dtype=t)
+                w = linalg.svd(a, compute_uv=False)
+                c = np.count_nonzero(np.absolute(w) > 0.5)
+                assert_equal(c, 1)
+                assert_equal(np.linalg.matrix_rank(a), 1)
+                assert_array_less(1, np.linalg.norm(a, ord=2))
+
+    def test_norm_object_array(self):
+        # gh-7575
+        testvector = np.array([np.array([0, 1]), 0, 0], dtype=object)
+
+        norm = linalg.norm(testvector)
+        assert_array_equal(norm, [0, 1])
+        assert_(norm.dtype == np.dtype('float64'))
+
+        norm = linalg.norm(testvector, ord=1)
+        assert_array_equal(norm, [0, 1])
+        assert_(norm.dtype != np.dtype('float64'))
+
+        norm = linalg.norm(testvector, ord=2)
+        assert_array_equal(norm, [0, 1])
+        assert_(norm.dtype == np.dtype('float64'))
+
+        assert_raises(ValueError, linalg.norm, testvector, ord='fro')
+        assert_raises(ValueError, linalg.norm, testvector, ord='nuc')
+        assert_raises(ValueError, linalg.norm, testvector, ord=np.inf)
+        assert_raises(ValueError, linalg.norm, testvector, ord=-np.inf)
+        assert_raises(ValueError, linalg.norm, testvector, ord=0)
+        assert_raises(ValueError, linalg.norm, testvector, ord=-1)
+        assert_raises(ValueError, linalg.norm, testvector, ord=-2)
+
+        testmatrix = np.array([[np.array([0, 1]), 0, 0],
+                               [0,                0, 0]], dtype=object)
+
+        norm = linalg.norm(testmatrix)
+        assert_array_equal(norm, [0, 1])
+        assert_(norm.dtype == np.dtype('float64'))
+
+        norm = linalg.norm(testmatrix, ord='fro')
+        assert_array_equal(norm, [0, 1])
+        assert_(norm.dtype == np.dtype('float64'))
+
+        assert_raises(TypeError, linalg.norm, testmatrix, ord='nuc')
+        assert_raises(ValueError, linalg.norm, testmatrix, ord=np.inf)
+        assert_raises(ValueError, linalg.norm, testmatrix, ord=-np.inf)
+        assert_raises(ValueError, linalg.norm, testmatrix, ord=0)
+        assert_raises(ValueError, linalg.norm, testmatrix, ord=1)
+        assert_raises(ValueError, linalg.norm, testmatrix, ord=-1)
+        assert_raises(TypeError, linalg.norm, testmatrix, ord=2)
+        assert_raises(TypeError, linalg.norm, testmatrix, ord=-2)
+        assert_raises(ValueError, linalg.norm, testmatrix, ord=3)
+
+    def test_lstsq_complex_larger_rhs(self):
+        # gh-9891
+        size = 20
+        n_rhs = 70
+        G = np.random.randn(size, size) + 1j * np.random.randn(size, size)
+        u = np.random.randn(size, n_rhs) + 1j * np.random.randn(size, n_rhs)
+        b = G.dot(u)
+        # This should work without segmentation fault.
+        u_lstsq, res, rank, sv = linalg.lstsq(G, b, rcond=None)
+        # check results just in case
+        assert_array_almost_equal(u_lstsq, u)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arithmetic.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arithmetic.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e256bc37c6c335a54261d34a6f802c2c7321328b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arithmetic.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arrayterator.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arrayterator.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b9646a8df2465254e5a0cb602830d95a544a550
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/arrayterator.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/bitwise_ops.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/bitwise_ops.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9318c67f6e4973518826e566c142acc53056a33a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/bitwise_ops.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/einsumfunc.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/einsumfunc.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..134247a1f999cdb040e5d2a5996ff711f72cf842
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/einsumfunc.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/lib_utils.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/lib_utils.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e44399097320c8ff6cd068c2f34a8099b74e7b30
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/lib_utils.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/literal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/literal.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d988303bf7a1b08cd039f7ccce63429f80acd5c2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/literal.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/multiarray.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/multiarray.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..827590b1214199cc47a15733fde0a520d6d85022
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/multiarray.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/numeric.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/numeric.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f25197625ab3be8be460dab9a9e29074fd912c7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/numeric.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/simple_py3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/simple_py3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c1f8bd185a6d1c28a26fb9023529a9539ed3c0d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/simple_py3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/ufuncs.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/ufuncs.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..335c9515f838a20868624ce3c3b18db9689fc581
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/ufuncs.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/warnings_and_errors.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/warnings_and_errors.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61d20ae9394b00e221d86cf2bed6143d153ffb1d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/numpy/typing/tests/data/pass/__pycache__/warnings_and_errors.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h
new file mode 100644
index 0000000000000000000000000000000000000000..454c0b2d12778ad7b7337d189653cb44b9327895
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/_convolution_double_backward.h
@@ -0,0 +1,53 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Function.h
+
+#include <ATen/Context.h>
+#include <ATen/DeviceGuard.h>
+#include <ATen/TensorUtils.h>
+#include <ATen/TracerMode.h>
+#include <ATen/core/Generator.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/core/Tensor.h>
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <string_view>
+
+
+
+#include <ATen/ops/_convolution_double_backward_ops.h>
+
+namespace at {
+
+
+// aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _convolution_double_backward(const ::std::optional<at::Tensor> & ggI, const ::std::optional<at::Tensor> & ggW, const ::std::optional<at::Tensor> & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
+    return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, int64_t>>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _convolution_double_backward(const ::std::optional<at::Tensor> & ggI, const ::std::optional<at::Tensor> & ggW, const ::std::optional<at::Tensor> & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, at::IntArrayRef stride, at::IntArrayRef padding, at::IntArrayRef dilation, bool transposed, at::IntArrayRef output_padding, int64_t groups, ::std::array<bool,3> output_mask) {
+    return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, c10::fromIntArrayRefSlow(stride), c10::fromIntArrayRefSlow(padding), c10::fromIntArrayRefSlow(dilation), transposed, c10::fromIntArrayRefSlow(output_padding), groups, output_mask);
+  }
+}
+
+// aten::_convolution_double_backward(Tensor? ggI, Tensor? ggW, Tensor? ggb, Tensor gO, Tensor weight, Tensor self, SymInt[] stride, SymInt[] padding, SymInt[] dilation, bool transposed, SymInt[] output_padding, SymInt groups, bool[3] output_mask) -> (Tensor, Tensor, Tensor)
+inline ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _convolution_double_backward_symint(const ::std::optional<at::Tensor> & ggI, const ::std::optional<at::Tensor> & ggW, const ::std::optional<at::Tensor> & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+    return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, output_mask);
+}
+namespace symint {
+  template <typename T, typename = std::enable_if_t<std::is_same_v<T, c10::SymInt>>>
+  ::std::tuple<at::Tensor,at::Tensor,at::Tensor> _convolution_double_backward(const ::std::optional<at::Tensor> & ggI, const ::std::optional<at::Tensor> & ggW, const ::std::optional<at::Tensor> & ggb, const at::Tensor & gO, const at::Tensor & weight, const at::Tensor & self, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, bool transposed, c10::SymIntArrayRef output_padding, c10::SymInt groups, ::std::array<bool,3> output_mask) {
+    return at::_ops::_convolution_double_backward::call(ggI, ggW, ggb, gO, weight, self, stride, padding, dilation, transposed, output_padding, groups, output_mask);
+  }
+}
+
+}
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..3a54c0f6c3734615a2acf372d0fc0f62d22feaad
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/cudnn_convolution_relu_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API cudnn_convolution_relu {
+  using schema = at::Tensor (const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::cudnn_convolution_relu";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "cudnn_convolution_relu(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups);
+};
+
+struct TORCH_API cudnn_convolution_relu_out {
+  using schema = at::Tensor & (const at::Tensor &, const at::Tensor &, const ::std::optional<at::Tensor> &, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymIntArrayRef, c10::SymInt, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::cudnn_convolution_relu";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "cudnn_convolution_relu.out(Tensor self, Tensor weight, Tensor? bias, SymInt[] stride, SymInt[] padding, SymInt[] dilation, SymInt groups, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, const at::Tensor & weight, const ::std::optional<at::Tensor> & bias, c10::SymIntArrayRef stride, c10::SymIntArrayRef padding, c10::SymIntArrayRef dilation, c10::SymInt groups, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h
new file mode 100644
index 0000000000000000000000000000000000000000..4b8399254b2f4ac51ae809f800672a90268a89a7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/mkldnn_adaptive_avg_pool2d_ops.h
@@ -0,0 +1,45 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from Operator.h
+
+#include <string_view>
+#include <tuple>
+#include <vector>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+namespace _ops {
+
+
+struct TORCH_API mkldnn_adaptive_avg_pool2d {
+  using schema = at::Tensor (const at::Tensor &, at::IntArrayRef);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d";
+  static constexpr const char* overload_name = "";
+  static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d(Tensor self, int[2] output_size) -> Tensor";
+  static at::Tensor call(const at::Tensor & self, at::IntArrayRef output_size);
+  static at::Tensor redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size);
+};
+
+struct TORCH_API mkldnn_adaptive_avg_pool2d_out {
+  using schema = at::Tensor & (const at::Tensor &, at::IntArrayRef, at::Tensor &);
+  using ptr_schema = schema*;
+  // See Note [static constexpr char* members for windows NVCC]
+  static constexpr const char* name = "aten::mkldnn_adaptive_avg_pool2d";
+  static constexpr const char* overload_name = "out";
+  static constexpr const char* schema_str = "mkldnn_adaptive_avg_pool2d.out(Tensor self, int[2] output_size, *, Tensor(a!) out) -> Tensor(a!)";
+  static at::Tensor & call(const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out);
+  static at::Tensor & redispatch(c10::DispatchKeySet dispatchKeySet, const at::Tensor & self, at::IntArrayRef output_size, at::Tensor & out);
+};
+
+}} // namespace at::_ops
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h
new file mode 100644
index 0000000000000000000000000000000000000000..7b0eb00bdf69acc7dc935f62a4590474eca07cbd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/silu_meta.h
@@ -0,0 +1,32 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+
+// @generated by torchgen/gen.py from NativeMetaFunction.h
+
+#include <c10/core/Scalar.h>
+#include <c10/core/Storage.h>
+#include <c10/core/TensorOptions.h>
+#include <c10/util/Deprecated.h>
+#include <optional>
+#include <c10/core/QScheme.h>
+#include <ATen/core/Reduction.h>
+#include <ATen/TensorIterator.h>
+#include <ATen/TensorMeta.h>
+#include <tuple>
+#include <vector>
+
+namespace at {
+namespace meta {
+
+struct TORCH_API structured_silu : public TensorIteratorBase {
+
+
+    void meta(const at::Tensor & self);
+};
+
+} // namespace native
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..f948c61e0902061f7ed6cec000648b106a50cf45
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/slow_conv_transpose2d_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor slow_conv_transpose2d(const at::Tensor & self, const at::Tensor & weight, at::IntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias={}, at::IntArrayRef stride=1, at::IntArrayRef padding=0, at::IntArrayRef output_padding=0, at::IntArrayRef dilation=1);
+TORCH_API at::Tensor slow_conv_transpose2d_symint(const at::Tensor & self, const at::Tensor & weight, c10::SymIntArrayRef kernel_size, const ::std::optional<at::Tensor> & bias={}, c10::SymIntArrayRef stride=c10::SymInt(1), c10::SymIntArrayRef padding=c10::SymInt(0), c10::SymIntArrayRef output_padding=c10::SymInt(0), c10::SymIntArrayRef dilation=c10::SymInt(1));
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h
new file mode 100644
index 0000000000000000000000000000000000000000..26131cf94bb7acb6b525a6cb3fd3a6d548411e3a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/ATen/ops/upsample_bilinear2d_compositeexplicitautogradnonfunctional_dispatch.h
@@ -0,0 +1,29 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+#pragma once
+// @generated by torchgen/gen.py from DispatchKeyFunction.h
+
+// NB: The implementing C++ file is RegisterDispatchKey.cpp
+
+// The only #includes we need are for custom classes that have defaults in the C++ API
+#include <c10/core/MemoryFormat.h>
+#include <c10/core/Scalar.h>
+#include <ATen/core/Reduction.h>
+
+// Forward declarations of any types needed in the operator signatures.
+// We can't directly include these classes because it will cause circular include dependencies.
+// This file is included by TensorBody.h, which defines the Tensor class.
+#include <ATen/core/ATen_fwd.h>
+
+namespace at {
+
+namespace compositeexplicitautogradnonfunctional {
+
+TORCH_API at::Tensor upsample_bilinear2d(const at::Tensor & self, at::IntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+TORCH_API at::Tensor upsample_bilinear2d_symint(const at::Tensor & self, c10::SymIntArrayRef output_size, bool align_corners, ::std::optional<double> scales_h=::std::nullopt, ::std::optional<double> scales_w=::std::nullopt);
+
+} // namespace compositeexplicitautogradnonfunctional
+} // namespace at
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.h
new file mode 100644
index 0000000000000000000000000000000000000000..1bb6306e75cda733f20e34123e0f6720aefb8a06
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/any.h
@@ -0,0 +1,154 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_ANY_H__
+#define GOOGLE_PROTOBUF_ANY_H__
+
+#include <string>
+
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/message_lite.h>
+
+#include <google/protobuf/port_def.inc>
+
+namespace google {
+namespace protobuf {
+
+class FieldDescriptor;
+class Message;
+
+namespace internal {
+
+extern const char kAnyFullTypeName[];          // "google.protobuf.Any".
+extern const char kTypeGoogleApisComPrefix[];  // "type.googleapis.com/".
+extern const char kTypeGoogleProdComPrefix[];  // "type.googleprod.com/".
+
+std::string GetTypeUrl(StringPiece message_name,
+                       StringPiece type_url_prefix);
+
+// Helper class used to implement google::protobuf::Any.
+class PROTOBUF_EXPORT AnyMetadata {
+  typedef ArenaStringPtr UrlType;
+  typedef ArenaStringPtr ValueType;
+ public:
+  // AnyMetadata does not take ownership of "type_url" and "value".
+  AnyMetadata(UrlType* type_url, ValueType* value);
+
+  // Packs a message using the default type URL prefix: "type.googleapis.com".
+  // The resulted type URL will be "type.googleapis.com/<message_full_name>".
+  template <typename T>
+  void PackFrom(const T& message) {
+    InternalPackFrom(message, kTypeGoogleApisComPrefix, T::FullMessageName());
+  }
+
+  void PackFrom(const Message& message);
+
+  // Packs a message using the given type URL prefix. The type URL will be
+  // constructed by concatenating the message type's full name to the prefix
+  // with an optional "/" separator if the prefix doesn't already end with "/".
+  // For example, both PackFrom(message, "type.googleapis.com") and
+  // PackFrom(message, "type.googleapis.com/") yield the same result type
+  // URL: "type.googleapis.com/<message_full_name>".
+  template <typename T>
+  void PackFrom(const T& message, StringPiece type_url_prefix) {
+    InternalPackFrom(message, type_url_prefix, T::FullMessageName());
+  }
+
+  void PackFrom(const Message& message, const std::string& type_url_prefix);
+
+  // Unpacks the payload into the given message. Returns false if the message's
+  // type doesn't match the type specified in the type URL (i.e., the full
+  // name after the last "/" of the type URL doesn't match the message's actual
+  // full name) or parsing the payload has failed.
+  template <typename T>
+  bool UnpackTo(T* message) const {
+    return InternalUnpackTo(T::FullMessageName(), message);
+  }
+
+  bool UnpackTo(Message* message) const;
+
+  // Checks whether the type specified in the type URL matches the given type.
+  // A type is considered matching if its full name matches the full name after
+  // the last "/" in the type URL.
+  template <typename T>
+  bool Is() const {
+    return InternalIs(T::FullMessageName());
+  }
+
+ private:
+  void InternalPackFrom(const MessageLite& message,
+                        StringPiece type_url_prefix,
+                        StringPiece type_name);
+  bool InternalUnpackTo(StringPiece type_name,
+                        MessageLite* message) const;
+  bool InternalIs(StringPiece type_name) const;
+
+  UrlType* type_url_;
+  ValueType* value_;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(AnyMetadata);
+};
+
+// Get the proto type name from Any::type_url value. For example, passing
+// "type.googleapis.com/rpc.QueryOrigin" will return "rpc.QueryOrigin" in
+// *full_type_name. Returns false if the type_url does not have a "/"
+// in the type url separating the full type name.
+//
+// NOTE: this function is available publicly as:
+//   google::protobuf::Any()  // static method on the generated message type.
+bool ParseAnyTypeUrl(const std::string& type_url, std::string* full_type_name);
+
+// Get the proto type name and prefix from Any::type_url value. For example,
+// passing "type.googleapis.com/rpc.QueryOrigin" will return
+// "type.googleapis.com/" in *url_prefix and "rpc.QueryOrigin" in
+// *full_type_name. Returns false if the type_url does not have a "/" in the
+// type url separating the full type name.
+bool ParseAnyTypeUrl(const std::string& type_url, std::string* url_prefix,
+                     std::string* full_type_name);
+
+// See if message is of type google.protobuf.Any, if so, return the descriptors
+// for "type_url" and "value" fields.
+bool GetAnyFieldDescriptors(const Message& message,
+                            const FieldDescriptor** type_url_field,
+                            const FieldDescriptor** value_field);
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_ANY_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/extension_set_inl.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/extension_set_inl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c8996c1c4e5022c0fe2a0e7fedbdd7241a8716b6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/extension_set_inl.h
@@ -0,0 +1,281 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_EXTENSION_SET_INL_H__
+#define GOOGLE_PROTOBUF_EXTENSION_SET_INL_H__
+
+#include <google/protobuf/parse_context.h>
+#include <google/protobuf/extension_set.h>
+#include <google/protobuf/metadata_lite.h>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+template <typename T>
+const char* ExtensionSet::ParseFieldWithExtensionInfo(
+    int number, bool was_packed_on_wire, const ExtensionInfo& extension,
+    InternalMetadata* metadata, const char* ptr, internal::ParseContext* ctx) {
+  if (was_packed_on_wire) {
+    switch (extension.type) {
+#define HANDLE_TYPE(UPPERCASE, CPP_CAMELCASE)                                \
+  case WireFormatLite::TYPE_##UPPERCASE:                                     \
+    return internal::Packed##CPP_CAMELCASE##Parser(                          \
+        MutableRawRepeatedField(number, extension.type, extension.is_packed, \
+                                extension.descriptor),                       \
+        ptr, ctx);
+      HANDLE_TYPE(INT32, Int32);
+      HANDLE_TYPE(INT64, Int64);
+      HANDLE_TYPE(UINT32, UInt32);
+      HANDLE_TYPE(UINT64, UInt64);
+      HANDLE_TYPE(SINT32, SInt32);
+      HANDLE_TYPE(SINT64, SInt64);
+      HANDLE_TYPE(FIXED32, Fixed32);
+      HANDLE_TYPE(FIXED64, Fixed64);
+      HANDLE_TYPE(SFIXED32, SFixed32);
+      HANDLE_TYPE(SFIXED64, SFixed64);
+      HANDLE_TYPE(FLOAT, Float);
+      HANDLE_TYPE(DOUBLE, Double);
+      HANDLE_TYPE(BOOL, Bool);
+#undef HANDLE_TYPE
+
+      case WireFormatLite::TYPE_ENUM:
+        return internal::PackedEnumParserArg<T>(
+            MutableRawRepeatedField(number, extension.type, extension.is_packed,
+                                    extension.descriptor),
+            ptr, ctx, extension.enum_validity_check.func,
+            extension.enum_validity_check.arg, metadata, number);
+      case WireFormatLite::TYPE_STRING:
+      case WireFormatLite::TYPE_BYTES:
+      case WireFormatLite::TYPE_GROUP:
+      case WireFormatLite::TYPE_MESSAGE:
+        GOOGLE_LOG(FATAL) << "Non-primitive types can't be packed.";
+        break;
+    }
+  } else {
+    switch (extension.type) {
+#define HANDLE_VARINT_TYPE(UPPERCASE, CPP_CAMELCASE)                        \
+  case WireFormatLite::TYPE_##UPPERCASE: {                                  \
+    uint64 value;                                                           \
+    ptr = VarintParse(ptr, &value);                                         \
+    GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);                                    \
+    if (extension.is_repeated) {                                            \
+      Add##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE,          \
+                         extension.is_packed, value, extension.descriptor); \
+    } else {                                                                \
+      Set##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, value,   \
+                         extension.descriptor);                             \
+    }                                                                       \
+  } break
+
+      HANDLE_VARINT_TYPE(INT32, Int32);
+      HANDLE_VARINT_TYPE(INT64, Int64);
+      HANDLE_VARINT_TYPE(UINT32, UInt32);
+      HANDLE_VARINT_TYPE(UINT64, UInt64);
+      HANDLE_VARINT_TYPE(BOOL, Bool);
+#undef HANDLE_VARINT_TYPE
+#define HANDLE_SVARINT_TYPE(UPPERCASE, CPP_CAMELCASE, SIZE)                 \
+  case WireFormatLite::TYPE_##UPPERCASE: {                                  \
+    uint64 val;                                                             \
+    ptr = VarintParse(ptr, &val);                                           \
+    GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);                                    \
+    auto value = WireFormatLite::ZigZagDecode##SIZE(val);                   \
+    if (extension.is_repeated) {                                            \
+      Add##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE,          \
+                         extension.is_packed, value, extension.descriptor); \
+    } else {                                                                \
+      Set##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, value,   \
+                         extension.descriptor);                             \
+    }                                                                       \
+  } break
+
+      HANDLE_SVARINT_TYPE(SINT32, Int32, 32);
+      HANDLE_SVARINT_TYPE(SINT64, Int64, 64);
+#undef HANDLE_SVARINT_TYPE
+#define HANDLE_FIXED_TYPE(UPPERCASE, CPP_CAMELCASE, CPPTYPE)                \
+  case WireFormatLite::TYPE_##UPPERCASE: {                                  \
+    auto value = UnalignedLoad<CPPTYPE>(ptr);                               \
+    ptr += sizeof(CPPTYPE);                                                 \
+    if (extension.is_repeated) {                                            \
+      Add##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE,          \
+                         extension.is_packed, value, extension.descriptor); \
+    } else {                                                                \
+      Set##CPP_CAMELCASE(number, WireFormatLite::TYPE_##UPPERCASE, value,   \
+                         extension.descriptor);                             \
+    }                                                                       \
+  } break
+
+      HANDLE_FIXED_TYPE(FIXED32, UInt32, uint32);
+      HANDLE_FIXED_TYPE(FIXED64, UInt64, uint64);
+      HANDLE_FIXED_TYPE(SFIXED32, Int32, int32);
+      HANDLE_FIXED_TYPE(SFIXED64, Int64, int64);
+      HANDLE_FIXED_TYPE(FLOAT, Float, float);
+      HANDLE_FIXED_TYPE(DOUBLE, Double, double);
+#undef HANDLE_FIXED_TYPE
+
+      case WireFormatLite::TYPE_ENUM: {
+        uint64 val;
+        ptr = VarintParse(ptr, &val);
+        GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+        int value = val;
+
+        if (!extension.enum_validity_check.func(
+                extension.enum_validity_check.arg, value)) {
+          WriteVarint(number, val, metadata->mutable_unknown_fields<T>());
+        } else if (extension.is_repeated) {
+          AddEnum(number, WireFormatLite::TYPE_ENUM, extension.is_packed, value,
+                  extension.descriptor);
+        } else {
+          SetEnum(number, WireFormatLite::TYPE_ENUM, value,
+                  extension.descriptor);
+        }
+        break;
+      }
+
+      case WireFormatLite::TYPE_BYTES:
+      case WireFormatLite::TYPE_STRING: {
+        std::string* value =
+            extension.is_repeated
+                ? AddString(number, WireFormatLite::TYPE_STRING,
+                            extension.descriptor)
+                : MutableString(number, WireFormatLite::TYPE_STRING,
+                                extension.descriptor);
+        int size = ReadSize(&ptr);
+        GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+        return ctx->ReadString(ptr, size, value);
+      }
+
+      case WireFormatLite::TYPE_GROUP: {
+        MessageLite* value =
+            extension.is_repeated
+                ? AddMessage(number, WireFormatLite::TYPE_GROUP,
+                             *extension.message_info.prototype,
+                             extension.descriptor)
+                : MutableMessage(number, WireFormatLite::TYPE_GROUP,
+                                 *extension.message_info.prototype,
+                                 extension.descriptor);
+        uint32 tag = (number << 3) + WireFormatLite::WIRETYPE_START_GROUP;
+        return ctx->ParseGroup(value, ptr, tag);
+      }
+
+      case WireFormatLite::TYPE_MESSAGE: {
+        MessageLite* value =
+            extension.is_repeated
+                ? AddMessage(number, WireFormatLite::TYPE_MESSAGE,
+                             *extension.message_info.prototype,
+                             extension.descriptor)
+                : MutableMessage(number, WireFormatLite::TYPE_MESSAGE,
+                                 *extension.message_info.prototype,
+                                 extension.descriptor);
+        return ctx->ParseMessage(value, ptr);
+      }
+    }
+  }
+  return ptr;
+}
+
+template <typename Msg, typename T>
+const char* ExtensionSet::ParseMessageSetItemTmpl(
+    const char* ptr, const Msg* containing_type,
+    internal::InternalMetadata* metadata, internal::ParseContext* ctx) {
+  std::string payload;
+  uint32 type_id = 0;
+  bool payload_read = false;
+  while (!ctx->Done(&ptr)) {
+    uint32 tag = static_cast<uint8>(*ptr++);
+    if (tag == WireFormatLite::kMessageSetTypeIdTag) {
+      uint64 tmp;
+      ptr = ParseBigVarint(ptr, &tmp);
+      GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+      type_id = tmp;
+      if (payload_read) {
+        ExtensionInfo extension;
+        bool was_packed_on_wire;
+        if (!FindExtension(2, type_id, containing_type, ctx, &extension,
+                           &was_packed_on_wire)) {
+          WriteLengthDelimited(type_id, payload,
+                               metadata->mutable_unknown_fields<T>());
+        } else {
+          MessageLite* value =
+              extension.is_repeated
+                  ? AddMessage(type_id, WireFormatLite::TYPE_MESSAGE,
+                               *extension.message_info.prototype,
+                               extension.descriptor)
+                  : MutableMessage(type_id, WireFormatLite::TYPE_MESSAGE,
+                                   *extension.message_info.prototype,
+                                   extension.descriptor);
+
+          const char* p;
+          // We can't use regular parse from string as we have to track
+          // proper recursion depth and descriptor pools.
+          ParseContext tmp_ctx(ctx->depth(), false, &p, payload);
+          tmp_ctx.data().pool = ctx->data().pool;
+          tmp_ctx.data().factory = ctx->data().factory;
+          GOOGLE_PROTOBUF_PARSER_ASSERT(value->_InternalParse(p, &tmp_ctx) &&
+                                         tmp_ctx.EndedAtLimit());
+        }
+        type_id = 0;
+      }
+    } else if (tag == WireFormatLite::kMessageSetMessageTag) {
+      if (type_id != 0) {
+        ptr = ParseFieldMaybeLazily(static_cast<uint64>(type_id) * 8 + 2, ptr,
+                                    containing_type, metadata, ctx);
+        GOOGLE_PROTOBUF_PARSER_ASSERT(ptr != nullptr);
+        type_id = 0;
+      } else {
+        int32 size = ReadSize(&ptr);
+        GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+        ptr = ctx->ReadString(ptr, size, &payload);
+        GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+        payload_read = true;
+      }
+    } else {
+      ptr = ReadTag(ptr - 1, &tag);
+      if (tag == 0 || (tag & 7) == 4) {
+        ctx->SetLastTag(tag);
+        return ptr;
+      }
+      ptr = ParseField(tag, ptr, containing_type, metadata, ctx);
+      GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+    }
+  }
+  return ptr;
+}
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#endif  // GOOGLE_PROTOBUF_EXTENSION_SET_INL_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/field_mask.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/field_mask.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..d8d57d823c293171522dfef2099cf1be0b51b261
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/field_mask.pb.h
@@ -0,0 +1,320 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/field_mask.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ffield_5fmask_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ffield_5fmask_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2ffield_5fmask_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2ffield_5fmask_2eproto {
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2ffield_5fmask_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class FieldMask;
+class FieldMaskDefaultTypeInternal;
+PROTOBUF_EXPORT extern FieldMaskDefaultTypeInternal _FieldMask_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::FieldMask* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::FieldMask>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+// ===================================================================
+
+class PROTOBUF_EXPORT FieldMask PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.FieldMask) */ {
+ public:
+  inline FieldMask() : FieldMask(nullptr) {}
+  virtual ~FieldMask();
+
+  FieldMask(const FieldMask& from);
+  FieldMask(FieldMask&& from) noexcept
+    : FieldMask() {
+    *this = ::std::move(from);
+  }
+
+  inline FieldMask& operator=(const FieldMask& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline FieldMask& operator=(FieldMask&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const FieldMask& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const FieldMask* internal_default_instance() {
+    return reinterpret_cast<const FieldMask*>(
+               &_FieldMask_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    0;
+
+  friend void swap(FieldMask& a, FieldMask& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(FieldMask* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(FieldMask* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline FieldMask* New() const final {
+    return CreateMaybeMessage<FieldMask>(nullptr);
+  }
+
+  FieldMask* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<FieldMask>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const FieldMask& from);
+  void MergeFrom(const FieldMask& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(FieldMask* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.FieldMask";
+  }
+  protected:
+  explicit FieldMask(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2ffield_5fmask_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2ffield_5fmask_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kPathsFieldNumber = 1,
+  };
+  // repeated string paths = 1;
+  int paths_size() const;
+  private:
+  int _internal_paths_size() const;
+  public:
+  void clear_paths();
+  const std::string& paths(int index) const;
+  std::string* mutable_paths(int index);
+  void set_paths(int index, const std::string& value);
+  void set_paths(int index, std::string&& value);
+  void set_paths(int index, const char* value);
+  void set_paths(int index, const char* value, size_t size);
+  std::string* add_paths();
+  void add_paths(const std::string& value);
+  void add_paths(std::string&& value);
+  void add_paths(const char* value);
+  void add_paths(const char* value, size_t size);
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>& paths() const;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>* mutable_paths();
+  private:
+  const std::string& _internal_paths(int index) const;
+  std::string* _internal_add_paths();
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.FieldMask)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string> paths_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2ffield_5fmask_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// FieldMask
+
+// repeated string paths = 1;
+inline int FieldMask::_internal_paths_size() const {
+  return paths_.size();
+}
+inline int FieldMask::paths_size() const {
+  return _internal_paths_size();
+}
+inline void FieldMask::clear_paths() {
+  paths_.Clear();
+}
+inline std::string* FieldMask::add_paths() {
+  // @@protoc_insertion_point(field_add_mutable:google.protobuf.FieldMask.paths)
+  return _internal_add_paths();
+}
+inline const std::string& FieldMask::_internal_paths(int index) const {
+  return paths_.Get(index);
+}
+inline const std::string& FieldMask::paths(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.FieldMask.paths)
+  return _internal_paths(index);
+}
+inline std::string* FieldMask::mutable_paths(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.FieldMask.paths)
+  return paths_.Mutable(index);
+}
+inline void FieldMask::set_paths(int index, const std::string& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldMask.paths)
+  paths_.Mutable(index)->assign(value);
+}
+inline void FieldMask::set_paths(int index, std::string&& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.FieldMask.paths)
+  paths_.Mutable(index)->assign(std::move(value));
+}
+inline void FieldMask::set_paths(int index, const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  paths_.Mutable(index)->assign(value);
+  // @@protoc_insertion_point(field_set_char:google.protobuf.FieldMask.paths)
+}
+inline void FieldMask::set_paths(int index, const char* value, size_t size) {
+  paths_.Mutable(index)->assign(
+    reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.FieldMask.paths)
+}
+inline std::string* FieldMask::_internal_add_paths() {
+  return paths_.Add();
+}
+inline void FieldMask::add_paths(const std::string& value) {
+  paths_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add:google.protobuf.FieldMask.paths)
+}
+inline void FieldMask::add_paths(std::string&& value) {
+  paths_.Add(std::move(value));
+  // @@protoc_insertion_point(field_add:google.protobuf.FieldMask.paths)
+}
+inline void FieldMask::add_paths(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  paths_.Add()->assign(value);
+  // @@protoc_insertion_point(field_add_char:google.protobuf.FieldMask.paths)
+}
+inline void FieldMask::add_paths(const char* value, size_t size) {
+  paths_.Add()->assign(reinterpret_cast<const char*>(value), size);
+  // @@protoc_insertion_point(field_add_pointer:google.protobuf.FieldMask.paths)
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>&
+FieldMask::paths() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.FieldMask.paths)
+  return paths_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField<std::string>*
+FieldMask::mutable_paths() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.FieldMask.paths)
+  return &paths_;
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ffield_5fmask_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_table_driven.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_table_driven.h
new file mode 100644
index 0000000000000000000000000000000000000000..7165fd31b5927217ed2628ca6044a2ffc41e5274
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/generated_message_table_driven.h
@@ -0,0 +1,344 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DRIVEN_H__
+#define GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DRIVEN_H__
+
+#include <google/protobuf/map.h>
+#include <google/protobuf/map_entry_lite.h>
+#include <google/protobuf/map_field_lite.h>
+#include <google/protobuf/message_lite.h>
+#include <google/protobuf/wire_format_lite.h>
+
+// We require C++11 and Clang to use constexpr for variables, as GCC 4.8
+// requires constexpr to be consistent between declarations of variables
+// unnecessarily (see https://gcc.gnu.org/bugzilla/show_bug.cgi?id=58541).
+// VS 2017 Update 3 also supports this usage of constexpr.
+#if defined(__clang__) || (defined(_MSC_VER) && _MSC_VER >= 1911)
+#define PROTOBUF_CONSTEXPR_VAR constexpr
+#else  // !__clang__
+#define PROTOBUF_CONSTEXPR_VAR
+#endif  // !_clang
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+#include <google/protobuf/port_def.inc>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// Processing-type masks.
+static constexpr const unsigned char kOneofMask = 0x40;
+static constexpr const unsigned char kRepeatedMask = 0x20;
+// Mask for the raw type: either a WireFormatLite::FieldType or one of the
+// ProcessingTypes below, without the oneof or repeated flag.
+static constexpr const unsigned char kTypeMask = 0x1f;
+
+// Wire type masks.
+static constexpr const unsigned char kNotPackedMask = 0x10;
+static constexpr const unsigned char kInvalidMask = 0x20;
+
+enum ProcessingTypes {
+  TYPE_STRING_CORD = 19,
+  TYPE_STRING_STRING_PIECE = 20,
+  TYPE_BYTES_CORD = 21,
+  TYPE_BYTES_STRING_PIECE = 22,
+  TYPE_STRING_INLINED = 23,
+  TYPE_BYTES_INLINED = 24,
+  TYPE_MAP = 25,
+};
+
+static_assert(TYPE_MAP < kRepeatedMask, "Invalid enum");
+
+struct PROTOBUF_EXPORT FieldMetadata {
+  uint32 offset;  // offset of this field in the struct
+  uint32 tag;     // field * 8 + wire_type
+  // byte offset * 8 + bit_offset;
+  // if the high bit is set then this is the byte offset of the oneof_case
+  // for this field.
+  uint32 has_offset;
+  uint32 type;      // the type of this field.
+  const void* ptr;  // auxiliary data
+
+  // From the serializer point of view each fundamental type can occur in
+  // 4 different ways. For simplicity we treat all combinations as a cartesion
+  // product although not all combinations are allowed.
+  enum FieldTypeClass {
+    kPresence,
+    kNoPresence,
+    kRepeated,
+    kPacked,
+    kOneOf,
+    kNumTypeClasses  // must be last enum
+  };
+  // C++ protobuf has 20 fundamental types, were we added Cord and StringPiece
+  // and also distinquish the same types if they have different wire format.
+  enum {
+    kCordType = 19,
+    kStringPieceType = 20,
+    kInlinedType = 21,
+    kNumTypes = 21,
+    kSpecial = kNumTypes * kNumTypeClasses,
+  };
+
+  static int CalculateType(int fundamental_type, FieldTypeClass type_class);
+};
+
+// TODO(ckennelly):  Add a static assertion to ensure that these masks do not
+// conflict with wiretypes.
+
+// ParseTableField is kept small to help simplify instructions for computing
+// offsets, as we will always need this information to parse a field.
+// Additional data, needed for some types, is stored in
+// AuxiliaryParseTableField.
+struct ParseTableField {
+  uint32 offset;
+  // The presence_index ordinarily represents a has_bit index, but for fields
+  // inside a oneof it represents the index in _oneof_case_.
+  uint32 presence_index;
+  unsigned char normal_wiretype;
+  unsigned char packed_wiretype;
+
+  // processing_type is given by:
+  //   (FieldDescriptor->type() << 1) | FieldDescriptor->is_packed()
+  unsigned char processing_type;
+
+  unsigned char tag_size;
+};
+
+struct ParseTable;
+
+union AuxiliaryParseTableField {
+  typedef bool (*EnumValidator)(int);
+
+  // Enums
+  struct enum_aux {
+    EnumValidator validator;
+  };
+  enum_aux enums;
+  // Group, messages
+  struct message_aux {
+    // ExplicitlyInitialized<T> -> T requires a reinterpret_cast, which prevents
+    // the tables from being constructed as a constexpr.  We use void to avoid
+    // the cast.
+    const void* default_message_void;
+    const MessageLite* default_message() const {
+      return static_cast<const MessageLite*>(default_message_void);
+    }
+  };
+  message_aux messages;
+  // Strings
+  struct string_aux {
+    const void* default_ptr;
+    const char* field_name;
+  };
+  string_aux strings;
+
+  struct map_aux {
+    bool (*parse_map)(io::CodedInputStream*, void*);
+  };
+  map_aux maps;
+
+  AuxiliaryParseTableField() = default;
+  constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::enum_aux e)
+      : enums(e) {}
+  constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::message_aux m)
+      : messages(m) {}
+  constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::string_aux s)
+      : strings(s) {}
+  constexpr AuxiliaryParseTableField(AuxiliaryParseTableField::map_aux m)
+      : maps(m) {}
+};
+
+struct ParseTable {
+  const ParseTableField* fields;
+  const AuxiliaryParseTableField* aux;
+  int max_field_number;
+  // TODO(ckennelly): Do something with this padding.
+
+  // TODO(ckennelly): Vet these for sign extension.
+  int64 has_bits_offset;
+  int64 oneof_case_offset;
+  int64 extension_offset;
+  int64 arena_offset;
+
+  // ExplicitlyInitialized<T> -> T requires a reinterpret_cast, which prevents
+  // the tables from being constructed as a constexpr.  We use void to avoid
+  // the cast.
+  const void* default_instance_void;
+  const MessageLite* default_instance() const {
+    return static_cast<const MessageLite*>(default_instance_void);
+  }
+
+  bool unknown_field_set;
+};
+
+static_assert(sizeof(ParseTableField) <= 16, "ParseTableField is too large");
+// The tables must be composed of POD components to ensure link-time
+// initialization.
+static_assert(std::is_pod<ParseTableField>::value, "");
+static_assert(std::is_pod<AuxiliaryParseTableField>::value, "");
+static_assert(std::is_pod<AuxiliaryParseTableField::enum_aux>::value, "");
+static_assert(std::is_pod<AuxiliaryParseTableField::message_aux>::value, "");
+static_assert(std::is_pod<AuxiliaryParseTableField::string_aux>::value, "");
+static_assert(std::is_pod<ParseTable>::value, "");
+
+// TODO(ckennelly): Consolidate these implementations into a single one, using
+// dynamic dispatch to the appropriate unknown field handler.
+bool MergePartialFromCodedStream(MessageLite* msg, const ParseTable& table,
+                                 io::CodedInputStream* input);
+bool MergePartialFromCodedStreamLite(MessageLite* msg, const ParseTable& table,
+                                     io::CodedInputStream* input);
+
+template <typename Entry>
+bool ParseMap(io::CodedInputStream* input, void* map_field) {
+  typedef typename MapEntryToMapField<Entry>::MapFieldType MapFieldType;
+  typedef Map<typename Entry::EntryKeyType, typename Entry::EntryValueType>
+      MapType;
+  typedef typename Entry::template Parser<MapFieldType, MapType> ParserType;
+
+  ParserType parser(static_cast<MapFieldType*>(map_field));
+  return WireFormatLite::ReadMessageNoVirtual(input, &parser);
+}
+
+struct SerializationTable {
+  int num_fields;
+  const FieldMetadata* field_table;
+};
+
+PROTOBUF_EXPORT void SerializeInternal(const uint8* base,
+                                       const FieldMetadata* table,
+                                       int32 num_fields,
+                                       io::CodedOutputStream* output);
+
+inline void TableSerialize(const MessageLite& msg,
+                           const SerializationTable* table,
+                           io::CodedOutputStream* output) {
+  const FieldMetadata* field_table = table->field_table;
+  int num_fields = table->num_fields - 1;
+  const uint8* base = reinterpret_cast<const uint8*>(&msg);
+  // TODO(gerbens) This skips the first test if we could use the fast
+  // array serialization path, we should make this
+  // int cached_size =
+  //    *reinterpret_cast<const int32*>(base + field_table->offset);
+  // SerializeWithCachedSize(msg, field_table + 1, num_fields, cached_size, ...)
+  // But we keep conformance with the old way for now.
+  SerializeInternal(base, field_table + 1, num_fields, output);
+}
+
+uint8* SerializeInternalToArray(const uint8* base, const FieldMetadata* table,
+                                int32 num_fields, bool is_deterministic,
+                                uint8* buffer);
+
+inline uint8* TableSerializeToArray(const MessageLite& msg,
+                                    const SerializationTable* table,
+                                    bool is_deterministic, uint8* buffer) {
+  const uint8* base = reinterpret_cast<const uint8*>(&msg);
+  const FieldMetadata* field_table = table->field_table + 1;
+  int num_fields = table->num_fields - 1;
+  return SerializeInternalToArray(base, field_table, num_fields,
+                                  is_deterministic, buffer);
+}
+
+template <typename T>
+struct CompareHelper {
+  bool operator()(const T& a, const T& b) const { return a < b; }
+};
+
+template <>
+struct CompareHelper<ArenaStringPtr> {
+  bool operator()(const ArenaStringPtr& a, const ArenaStringPtr& b) const {
+    return a.Get() < b.Get();
+  }
+};
+
+struct CompareMapKey {
+  template <typename T>
+  bool operator()(const MapEntryHelper<T>& a,
+                  const MapEntryHelper<T>& b) const {
+    return Compare(a.key_, b.key_);
+  }
+  template <typename T>
+  bool Compare(const T& a, const T& b) const {
+    return CompareHelper<T>()(a, b);
+  }
+};
+
+template <typename MapFieldType, const SerializationTable* table>
+void MapFieldSerializer(const uint8* base, uint32 offset, uint32 tag,
+                        uint32 has_offset, io::CodedOutputStream* output) {
+  typedef MapEntryHelper<typename MapFieldType::EntryTypeTrait> Entry;
+  typedef typename MapFieldType::MapType::const_iterator Iter;
+
+  const MapFieldType& map_field =
+      *reinterpret_cast<const MapFieldType*>(base + offset);
+  const SerializationTable* t =
+      table +
+      has_offset;  // has_offset is overloaded for maps to mean table offset
+  if (!output->IsSerializationDeterministic()) {
+    for (Iter it = map_field.GetMap().begin(); it != map_field.GetMap().end();
+         ++it) {
+      Entry map_entry(*it);
+      output->WriteVarint32(tag);
+      output->WriteVarint32(map_entry._cached_size_);
+      SerializeInternal(reinterpret_cast<const uint8*>(&map_entry),
+                        t->field_table, t->num_fields, output);
+    }
+  } else {
+    std::vector<Entry> v;
+    for (Iter it = map_field.GetMap().begin(); it != map_field.GetMap().end();
+         ++it) {
+      v.push_back(Entry(*it));
+    }
+    std::sort(v.begin(), v.end(), CompareMapKey());
+    for (int i = 0; i < v.size(); i++) {
+      output->WriteVarint32(tag);
+      output->WriteVarint32(v[i]._cached_size_);
+      SerializeInternal(reinterpret_cast<const uint8*>(&v[i]), t->field_table,
+                        t->num_fields, output);
+    }
+  }
+}
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_GENERATED_MESSAGE_TABLE_DRIVEN_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/implicit_weak_message.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/implicit_weak_message.h
new file mode 100644
index 0000000000000000000000000000000000000000..d373a520a14bf5ab8f0bd6a354e84d0a0f1580ce
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/implicit_weak_message.h
@@ -0,0 +1,195 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_IMPLICIT_WEAK_MESSAGE_H__
+#define GOOGLE_PROTOBUF_IMPLICIT_WEAK_MESSAGE_H__
+
+#include <string>
+
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/message_lite.h>
+#include <google/protobuf/repeated_field.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+#include <google/protobuf/port_def.inc>
+
+// This file is logically internal-only and should only be used by protobuf
+// generated code.
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// An implementation of MessageLite that treats all data as unknown. This type
+// acts as a placeholder for an implicit weak field in the case where the true
+// message type does not get linked into the binary.
+class PROTOBUF_EXPORT ImplicitWeakMessage : public MessageLite {
+ public:
+  ImplicitWeakMessage() {}
+  explicit ImplicitWeakMessage(Arena* arena) : MessageLite(arena) {}
+
+  static const ImplicitWeakMessage* default_instance();
+
+  std::string GetTypeName() const override { return ""; }
+
+  MessageLite* New() const override { return new ImplicitWeakMessage; }
+  MessageLite* New(Arena* arena) const override {
+    return Arena::CreateMessage<ImplicitWeakMessage>(arena);
+  }
+
+  void Clear() override { data_.clear(); }
+
+  bool IsInitialized() const override { return true; }
+
+  void CheckTypeAndMergeFrom(const MessageLite& other) override {
+    data_.append(static_cast<const ImplicitWeakMessage&>(other).data_);
+  }
+
+  const char* _InternalParse(const char* ptr, ParseContext* ctx) final;
+
+  size_t ByteSizeLong() const override { return data_.size(); }
+
+  uint8* _InternalSerialize(uint8* target,
+                            io::EpsCopyOutputStream* stream) const final {
+    return stream->WriteRaw(data_.data(), static_cast<int>(data_.size()),
+                            target);
+  }
+
+  int GetCachedSize() const override { return static_cast<int>(data_.size()); }
+
+  typedef void InternalArenaConstructable_;
+
+ private:
+  std::string data_;
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(ImplicitWeakMessage);
+};
+
+// A type handler for use with implicit weak repeated message fields.
+template <typename ImplicitWeakType>
+class ImplicitWeakTypeHandler {
+ public:
+  typedef MessageLite Type;
+  static constexpr bool Moveable = false;
+
+  static inline MessageLite* NewFromPrototype(const MessageLite* prototype,
+                                              Arena* arena = NULL) {
+    return prototype->New(arena);
+  }
+
+  static inline void Delete(MessageLite* value, Arena* arena) {
+    if (arena == NULL) {
+      delete value;
+    }
+  }
+  static inline Arena* GetArena(MessageLite* value) {
+    return value->GetArena();
+  }
+  static inline void* GetMaybeArenaPointer(MessageLite* value) {
+    return value->GetArena();
+  }
+  static inline void Clear(MessageLite* value) { value->Clear(); }
+  static void Merge(const MessageLite& from, MessageLite* to) {
+    to->CheckTypeAndMergeFrom(from);
+  }
+};
+
+}  // namespace internal
+
+template <typename T>
+struct WeakRepeatedPtrField {
+  using TypeHandler = internal::ImplicitWeakTypeHandler<T>;
+  WeakRepeatedPtrField() : weak() {}
+  explicit WeakRepeatedPtrField(Arena* arena) : weak(arena) {}
+  ~WeakRepeatedPtrField() { weak.template Destroy<TypeHandler>(); }
+
+  typedef internal::RepeatedPtrIterator<MessageLite> iterator;
+  typedef internal::RepeatedPtrIterator<const MessageLite> const_iterator;
+  typedef internal::RepeatedPtrOverPtrsIterator<MessageLite*, void*>
+      pointer_iterator;
+  typedef internal::RepeatedPtrOverPtrsIterator<const MessageLite* const,
+                                                const void* const>
+      const_pointer_iterator;
+
+  iterator begin() { return iterator(base().raw_data()); }
+  const_iterator begin() const { return iterator(base().raw_data()); }
+  const_iterator cbegin() const { return begin(); }
+  iterator end() { return begin() + base().size(); }
+  const_iterator end() const { return begin() + base().size(); }
+  const_iterator cend() const { return end(); }
+  pointer_iterator pointer_begin() {
+    return pointer_iterator(base().raw_mutable_data());
+  }
+  const_pointer_iterator pointer_begin() const {
+    return const_pointer_iterator(base().raw_mutable_data());
+  }
+  pointer_iterator pointer_end() {
+    return pointer_iterator(base().raw_mutable_data() + base().size());
+  }
+  const_pointer_iterator pointer_end() const {
+    return const_pointer_iterator(base().raw_mutable_data() + base().size());
+  }
+
+  MessageLite* AddWeak(const MessageLite* prototype) {
+    return base().AddWeak(prototype);
+  }
+  T* Add() { return weak.Add(); }
+  void Clear() { base().template Clear<TypeHandler>(); }
+  void MergeFrom(const WeakRepeatedPtrField& other) {
+    base().template MergeFrom<TypeHandler>(other.base());
+  }
+  void InternalSwap(WeakRepeatedPtrField* other) {
+    base().InternalSwap(&other->base());
+  }
+
+  const internal::RepeatedPtrFieldBase& base() const { return weak; }
+  internal::RepeatedPtrFieldBase& base() { return weak; }
+  // Union disables running the destructor. Which would create a strong link.
+  // Instead we explicitly destroy the underlying base through the virtual
+  // destructor.
+  union {
+    RepeatedPtrField<T> weak;
+  };
+};
+
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_IMPLICIT_WEAK_MESSAGE_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_entry_lite.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_entry_lite.h
new file mode 100644
index 0000000000000000000000000000000000000000..eb1a33b1937b979d09dc38664c1c937910a9f62d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/map_entry_lite.h
@@ -0,0 +1,676 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_MAP_ENTRY_LITE_H__
+#define GOOGLE_PROTOBUF_MAP_ENTRY_LITE_H__
+
+#include <assert.h>
+#include <string>
+
+#include <google/protobuf/stubs/casts.h>
+#include <google/protobuf/parse_context.h>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/map.h>
+#include <google/protobuf/map_type_handler.h>
+#include <google/protobuf/port.h>
+#include <google/protobuf/wire_format_lite.h>
+
+#include <google/protobuf/port_def.inc>
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+namespace internal {
+template <typename Derived, typename Key, typename Value,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+class MapEntry;
+template <typename Derived, typename Key, typename Value,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+class MapFieldLite;
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+namespace google {
+namespace protobuf {
+namespace internal {
+
+// MoveHelper::Move is used to set *dest.  It copies *src, or moves it (in
+// the C++11 sense), or swaps it. *src is left in a sane state for
+// subsequent destruction, but shouldn't be used for anything.
+template <bool is_enum, bool is_message, bool is_stringlike, typename T>
+struct MoveHelper {  // primitives
+  static void Move(T* src, T* dest) { *dest = *src; }
+};
+
+template <bool is_message, bool is_stringlike, typename T>
+struct MoveHelper<true, is_message, is_stringlike, T> {  // enums
+  static void Move(T* src, T* dest) { *dest = *src; }
+  // T is an enum here, so allow conversions to and from int.
+  static void Move(T* src, int* dest) { *dest = static_cast<int>(*src); }
+  static void Move(int* src, T* dest) { *dest = static_cast<T>(*src); }
+};
+
+template <bool is_stringlike, typename T>
+struct MoveHelper<false, true, is_stringlike, T> {  // messages
+  static void Move(T* src, T* dest) { dest->Swap(src); }
+};
+
+template <typename T>
+struct MoveHelper<false, false, true, T> {  // strings and similar
+  static void Move(T* src, T* dest) {
+#if __cplusplus >= 201103L
+    *dest = std::move(*src);
+#else
+    dest->swap(*src);
+#endif
+  }
+};
+
+// Functions for operating on a map entry.  Does not contain any representation
+// (this class is not intended to be instantiated).
+template <typename Key, typename Value, WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType>
+struct MapEntryFuncs {
+  typedef MapTypeHandler<kKeyFieldType, Key> KeyTypeHandler;
+  typedef MapTypeHandler<kValueFieldType, Value> ValueTypeHandler;
+  static const int kKeyFieldNumber = 1;
+  static const int kValueFieldNumber = 2;
+
+  static uint8* InternalSerialize(int field_number, const Key& key,
+                                  const Value& value, uint8* ptr,
+                                  io::EpsCopyOutputStream* stream) {
+    ptr = stream->EnsureSpace(ptr);
+    ptr = WireFormatLite::WriteTagToArray(
+        field_number, WireFormatLite::WIRETYPE_LENGTH_DELIMITED, ptr);
+    ptr = io::CodedOutputStream::WriteVarint32ToArray(GetCachedSize(key, value),
+                                                      ptr);
+
+    ptr = KeyTypeHandler::Write(kKeyFieldNumber, key, ptr, stream);
+    return ValueTypeHandler::Write(kValueFieldNumber, value, ptr, stream);
+  }
+
+  static size_t ByteSizeLong(const Key& key, const Value& value) {
+    // Tags for key and value will both be one byte (field numbers 1 and 2).
+    size_t inner_length =
+        2 + KeyTypeHandler::ByteSize(key) + ValueTypeHandler::ByteSize(value);
+    return inner_length + io::CodedOutputStream::VarintSize32(
+                              static_cast<uint32>(inner_length));
+  }
+
+  static int GetCachedSize(const Key& key, const Value& value) {
+    // Tags for key and value will both be one byte (field numbers 1 and 2).
+    return 2 + KeyTypeHandler::GetCachedSize(key) +
+           ValueTypeHandler::GetCachedSize(value);
+  }
+};
+
+// MapEntryImpl is used to implement parsing and serialization of map entries.
+// It uses Curious Recursive Template Pattern (CRTP) to provide the type of
+// the eventual code to the template code.
+template <typename Derived, typename Base, typename Key, typename Value,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+class MapEntryImpl : public Base {
+ public:
+  typedef MapEntryFuncs<Key, Value, kKeyFieldType, kValueFieldType> Funcs;
+
+ protected:
+  // Provide utilities to parse/serialize key/value.  Provide utilities to
+  // manipulate internal stored type.
+  typedef MapTypeHandler<kKeyFieldType, Key> KeyTypeHandler;
+  typedef MapTypeHandler<kValueFieldType, Value> ValueTypeHandler;
+
+  // Define internal memory layout. Strings and messages are stored as
+  // pointers, while other types are stored as values.
+  typedef typename KeyTypeHandler::TypeOnMemory KeyOnMemory;
+  typedef typename ValueTypeHandler::TypeOnMemory ValueOnMemory;
+
+  // Enum type cannot be used for MapTypeHandler::Read. Define a type
+  // which will replace Enum with int.
+  typedef typename KeyTypeHandler::MapEntryAccessorType KeyMapEntryAccessorType;
+  typedef
+      typename ValueTypeHandler::MapEntryAccessorType ValueMapEntryAccessorType;
+
+  // Constants for field number.
+  static const int kKeyFieldNumber = 1;
+  static const int kValueFieldNumber = 2;
+
+  // Constants for field tag.
+  static const uint8 kKeyTag =
+      GOOGLE_PROTOBUF_WIRE_FORMAT_MAKE_TAG(kKeyFieldNumber, KeyTypeHandler::kWireType);
+  static const uint8 kValueTag = GOOGLE_PROTOBUF_WIRE_FORMAT_MAKE_TAG(
+      kValueFieldNumber, ValueTypeHandler::kWireType);
+  static const size_t kTagSize = 1;
+
+ public:
+  // Work-around for a compiler bug (see repeated_field.h).
+  typedef void MapEntryHasMergeTypeTrait;
+  typedef Derived EntryType;
+  typedef Key EntryKeyType;
+  typedef Value EntryValueType;
+  static const WireFormatLite::FieldType kEntryKeyFieldType = kKeyFieldType;
+  static const WireFormatLite::FieldType kEntryValueFieldType = kValueFieldType;
+  static const int kEntryDefaultEnumValue = default_enum_value;
+
+  MapEntryImpl() {
+    KeyTypeHandler::Initialize(&key_, NULL);
+    ValueTypeHandler::InitializeMaybeByDefaultEnum(&value_, default_enum_value,
+                                                   NULL);
+    _has_bits_[0] = 0;
+  }
+
+  explicit MapEntryImpl(Arena* arena) : Base(arena) {
+    KeyTypeHandler::Initialize(&key_, arena);
+    ValueTypeHandler::InitializeMaybeByDefaultEnum(&value_, default_enum_value,
+                                                   arena);
+    _has_bits_[0] = 0;
+  }
+
+  ~MapEntryImpl() {
+    if (Base::GetArena() != NULL) return;
+    KeyTypeHandler::DeleteNoArena(key_);
+    ValueTypeHandler::DeleteNoArena(value_);
+  }
+
+  // accessors ======================================================
+
+  virtual inline const KeyMapEntryAccessorType& key() const {
+    return KeyTypeHandler::GetExternalReference(key_);
+  }
+  virtual inline const ValueMapEntryAccessorType& value() const {
+    return ValueTypeHandler::DefaultIfNotInitialized(
+        value_, Derived::internal_default_instance()->value_);
+  }
+  inline KeyMapEntryAccessorType* mutable_key() {
+    set_has_key();
+    return KeyTypeHandler::EnsureMutable(&key_, Base::GetArena());
+  }
+  inline ValueMapEntryAccessorType* mutable_value() {
+    set_has_value();
+    return ValueTypeHandler::EnsureMutable(&value_, Base::GetArena());
+  }
+
+  // implements MessageLite =========================================
+
+  // MapEntryImpl is for implementation only and this function isn't called
+  // anywhere. Just provide a fake implementation here for MessageLite.
+  std::string GetTypeName() const override { return ""; }
+
+  void CheckTypeAndMergeFrom(const MessageLite& other) override {
+    MergeFromInternal(*::google::protobuf::internal::DownCast<const Derived*>(&other));
+  }
+
+  const char* _InternalParse(const char* ptr, ParseContext* ctx) final {
+    while (!ctx->Done(&ptr)) {
+      uint32 tag;
+      ptr = ReadTag(ptr, &tag);
+      GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+      if (tag == kKeyTag) {
+        set_has_key();
+        KeyMapEntryAccessorType* key = mutable_key();
+        ptr = KeyTypeHandler::Read(ptr, ctx, key);
+        if (!Derived::ValidateKey(key)) return nullptr;
+      } else if (tag == kValueTag) {
+        set_has_value();
+        ValueMapEntryAccessorType* value = mutable_value();
+        ptr = ValueTypeHandler::Read(ptr, ctx, value);
+        if (!Derived::ValidateValue(value)) return nullptr;
+      } else {
+        if (tag == 0 || WireFormatLite::GetTagWireType(tag) ==
+                            WireFormatLite::WIRETYPE_END_GROUP) {
+          ctx->SetLastTag(tag);
+          return ptr;
+        }
+        ptr = UnknownFieldParse(tag, static_cast<std::string*>(nullptr), ptr,
+                                ctx);
+      }
+      GOOGLE_PROTOBUF_PARSER_ASSERT(ptr);
+    }
+    return ptr;
+  }
+
+  size_t ByteSizeLong() const override {
+    size_t size = 0;
+    size += kTagSize + static_cast<size_t>(KeyTypeHandler::ByteSize(key()));
+    size += kTagSize + static_cast<size_t>(ValueTypeHandler::ByteSize(value()));
+    return size;
+  }
+
+  ::google::protobuf::uint8* _InternalSerialize(::google::protobuf::uint8* ptr,
+                              io::EpsCopyOutputStream* stream) const override {
+    ptr = KeyTypeHandler::Write(kKeyFieldNumber, key(), ptr, stream);
+    return ValueTypeHandler::Write(kValueFieldNumber, value(), ptr, stream);
+  }
+
+  // Don't override SerializeWithCachedSizesToArray.  Use MessageLite's.
+
+  int GetCachedSize() const override {
+    int size = 0;
+    size += has_key() ? static_cast<int>(kTagSize) +
+                            KeyTypeHandler::GetCachedSize(key())
+                      : 0;
+    size += has_value() ? static_cast<int>(kTagSize) +
+                              ValueTypeHandler::GetCachedSize(value())
+                        : 0;
+    return size;
+  }
+
+  bool IsInitialized() const override {
+    return ValueTypeHandler::IsInitialized(value_);
+  }
+
+  Base* New() const override {
+    Derived* entry = new Derived;
+    return entry;
+  }
+
+  Base* New(Arena* arena) const override {
+    Derived* entry = Arena::CreateMessage<Derived>(arena);
+    return entry;
+  }
+
+ protected:
+  // We can't declare this function directly here as it would hide the other
+  // overload (const Message&).
+  void MergeFromInternal(const MapEntryImpl& from) {
+    if (from._has_bits_[0]) {
+      if (from.has_key()) {
+        KeyTypeHandler::EnsureMutable(&key_, Base::GetArena());
+        KeyTypeHandler::Merge(from.key(), &key_, Base::GetArena());
+        set_has_key();
+      }
+      if (from.has_value()) {
+        ValueTypeHandler::EnsureMutable(&value_, Base::GetArena());
+        ValueTypeHandler::Merge(from.value(), &value_, Base::GetArena());
+        set_has_value();
+      }
+    }
+  }
+
+ public:
+  void Clear() override {
+    KeyTypeHandler::Clear(&key_, Base::GetArena());
+    ValueTypeHandler::ClearMaybeByDefaultEnum(&value_, Base::GetArena(),
+                                              default_enum_value);
+    clear_has_key();
+    clear_has_value();
+  }
+
+  static void InitAsDefaultInstance() {
+    Derived* d = const_cast<Derived*>(Derived::internal_default_instance());
+    KeyTypeHandler::AssignDefaultValue(&d->key_);
+    ValueTypeHandler::AssignDefaultValue(&d->value_);
+  }
+
+  // Parsing using MergePartialFromCodedStream, above, is not as
+  // efficient as it could be.  This helper class provides a speedier way.
+  template <typename MapField, typename Map>
+  class Parser {
+   public:
+    explicit Parser(MapField* mf) : mf_(mf), map_(mf->MutableMap()) {}
+    ~Parser() {
+      if (entry_ != nullptr && entry_->GetArena() == nullptr) delete entry_;
+    }
+
+    // This does what the typical MergePartialFromCodedStream() is expected to
+    // do, with the additional side-effect that if successful (i.e., if true is
+    // going to be its return value) it inserts the key-value pair into map_.
+    bool MergePartialFromCodedStream(io::CodedInputStream* input) {
+      // Look for the expected thing: a key and then a value.  If it fails,
+      // invoke the enclosing class's MergePartialFromCodedStream, or return
+      // false if that would be pointless.
+      if (input->ExpectTag(kKeyTag)) {
+        if (!KeyTypeHandler::Read(input, &key_)) {
+          return false;
+        }
+        // Peek at the next byte to see if it is kValueTag.  If not, bail out.
+        const void* data;
+        int size;
+        input->GetDirectBufferPointerInline(&data, &size);
+        // We could use memcmp here, but we don't bother. The tag is one byte.
+        static_assert(kTagSize == 1, "tag size must be 1");
+        if (size > 0 && *reinterpret_cast<const char*>(data) == kValueTag) {
+          typename Map::size_type map_size = map_->size();
+          value_ptr_ = &(*map_)[key_];
+          if (PROTOBUF_PREDICT_TRUE(map_size != map_->size())) {
+            // We created a new key-value pair.  Fill in the value.
+            typedef
+                typename MapIf<ValueTypeHandler::kIsEnum, int*, Value*>::type T;
+            input->Skip(kTagSize);  // Skip kValueTag.
+            if (!ValueTypeHandler::Read(input,
+                                        reinterpret_cast<T>(value_ptr_))) {
+              map_->erase(key_);  // Failure! Undo insertion.
+              return false;
+            }
+            if (input->ExpectAtEnd()) return true;
+            return ReadBeyondKeyValuePair(input);
+          }
+        }
+      } else {
+        key_ = Key();
+      }
+
+      NewEntry();
+      *entry_->mutable_key() = key_;
+      const bool result = entry_->MergePartialFromCodedStream(input);
+      if (result) UseKeyAndValueFromEntry();
+      return result;
+    }
+
+    const char* _InternalParse(const char* ptr, ParseContext* ctx) {
+      if (PROTOBUF_PREDICT_TRUE(!ctx->Done(&ptr) && *ptr == kKeyTag)) {
+        ptr = KeyTypeHandler::Read(ptr + 1, ctx, &key_);
+        if (PROTOBUF_PREDICT_FALSE(!ptr || !Derived::ValidateKey(&key_))) {
+          return nullptr;
+        }
+        if (PROTOBUF_PREDICT_TRUE(!ctx->Done(&ptr) && *ptr == kValueTag)) {
+          typename Map::size_type map_size = map_->size();
+          value_ptr_ = &(*map_)[key_];
+          if (PROTOBUF_PREDICT_TRUE(map_size != map_->size())) {
+            using T =
+                typename MapIf<ValueTypeHandler::kIsEnum, int*, Value*>::type;
+            ptr = ValueTypeHandler::Read(ptr + 1, ctx,
+                                         reinterpret_cast<T>(value_ptr_));
+            if (PROTOBUF_PREDICT_FALSE(!ptr ||
+                                       !Derived::ValidateValue(value_ptr_))) {
+              map_->erase(key_);  // Failure! Undo insertion.
+              return nullptr;
+            }
+            if (PROTOBUF_PREDICT_TRUE(ctx->Done(&ptr))) return ptr;
+            if (!ptr) return nullptr;
+            NewEntry();
+            ValueMover::Move(value_ptr_, entry_->mutable_value());
+            map_->erase(key_);
+            goto move_key;
+          }
+        } else {
+          if (!ptr) return nullptr;
+        }
+        NewEntry();
+      move_key:
+        KeyMover::Move(&key_, entry_->mutable_key());
+      } else {
+        if (!ptr) return nullptr;
+        NewEntry();
+      }
+      ptr = entry_->_InternalParse(ptr, ctx);
+      if (ptr) UseKeyAndValueFromEntry();
+      return ptr;
+    }
+
+    template <typename UnknownType>
+    const char* ParseWithEnumValidation(const char* ptr, ParseContext* ctx,
+                                        bool (*is_valid)(int), uint32 field_num,
+                                        InternalMetadata* metadata) {
+      auto entry = NewEntry();
+      ptr = entry->_InternalParse(ptr, ctx);
+      if (!ptr) return nullptr;
+      if (is_valid(entry->value())) {
+        UseKeyAndValueFromEntry();
+      } else {
+        WriteLengthDelimited(field_num, entry->SerializeAsString(),
+                             metadata->mutable_unknown_fields<UnknownType>());
+      }
+      return ptr;
+    }
+
+    MapEntryImpl* NewEntry() { return entry_ = mf_->NewEntry(); }
+
+    const Key& key() const { return key_; }
+    const Value& value() const { return *value_ptr_; }
+
+    const Key& entry_key() const { return entry_->key(); }
+    const Value& entry_value() const { return entry_->value(); }
+
+   private:
+    void UseKeyAndValueFromEntry() {
+      // Update key_ in case we need it later (because key() is called).
+      // This is potentially inefficient, especially if the key is
+      // expensive to copy (e.g., a long string), but this is a cold
+      // path, so it's not a big deal.
+      key_ = entry_->key();
+      value_ptr_ = &(*map_)[key_];
+      ValueMover::Move(entry_->mutable_value(), value_ptr_);
+    }
+
+    // After reading a key and value successfully, and inserting that data
+    // into map_, we are not at the end of the input.  This is unusual, but
+    // allowed by the spec.
+    bool ReadBeyondKeyValuePair(io::CodedInputStream* input) PROTOBUF_COLD {
+      NewEntry();
+      ValueMover::Move(value_ptr_, entry_->mutable_value());
+      map_->erase(key_);
+      KeyMover::Move(&key_, entry_->mutable_key());
+      const bool result = entry_->MergePartialFromCodedStream(input);
+      if (result) UseKeyAndValueFromEntry();
+      return result;
+    }
+
+    typedef MoveHelper<KeyTypeHandler::kIsEnum, KeyTypeHandler::kIsMessage,
+                       KeyTypeHandler::kWireType ==
+                           WireFormatLite::WIRETYPE_LENGTH_DELIMITED,
+                       Key>
+        KeyMover;
+    typedef MoveHelper<ValueTypeHandler::kIsEnum, ValueTypeHandler::kIsMessage,
+                       ValueTypeHandler::kWireType ==
+                           WireFormatLite::WIRETYPE_LENGTH_DELIMITED,
+                       Value>
+        ValueMover;
+
+    MapField* const mf_;
+    Map* const map_;
+    Key key_;
+    Value* value_ptr_;
+    MapEntryImpl* entry_ = nullptr;
+  };
+
+ protected:
+  void set_has_key() { _has_bits_[0] |= 0x00000001u; }
+  bool has_key() const { return (_has_bits_[0] & 0x00000001u) != 0; }
+  void clear_has_key() { _has_bits_[0] &= ~0x00000001u; }
+  void set_has_value() { _has_bits_[0] |= 0x00000002u; }
+  bool has_value() const { return (_has_bits_[0] & 0x00000002u) != 0; }
+  void clear_has_value() { _has_bits_[0] &= ~0x00000002u; }
+
+ public:
+  inline Arena* GetArena() const { return Base::GetArena(); }
+
+ public:  // Needed for constructing tables
+  KeyOnMemory key_;
+  ValueOnMemory value_;
+  uint32 _has_bits_[1];
+
+ private:
+  friend class ::PROTOBUF_NAMESPACE_ID::Arena;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  template <typename C, typename K, typename V, WireFormatLite::FieldType,
+            WireFormatLite::FieldType, int>
+  friend class internal::MapEntry;
+  template <typename C, typename K, typename V, WireFormatLite::FieldType,
+            WireFormatLite::FieldType, int>
+  friend class internal::MapFieldLite;
+
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MapEntryImpl);
+};
+
+template <typename T, typename Key, typename Value,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+class MapEntryLite
+    : public MapEntryImpl<T, MessageLite, Key, Value, kKeyFieldType,
+                          kValueFieldType, default_enum_value> {
+ public:
+  typedef MapEntryImpl<T, MessageLite, Key, Value, kKeyFieldType,
+                       kValueFieldType, default_enum_value>
+      SuperType;
+  MapEntryLite() {}
+  explicit MapEntryLite(Arena* arena) : SuperType(arena) {}
+  ~MapEntryLite() { MessageLite::_internal_metadata_.Delete<std::string>(); }
+  void MergeFrom(const MapEntryLite& other) { MergeFromInternal(other); }
+
+ private:
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(MapEntryLite);
+};
+// The completely unprincipled and unwieldy use of template parameters in
+// the map code necessitates wrappers to make the code a little bit more
+// manageable.
+template <typename Derived>
+struct DeconstructMapEntry;
+
+template <typename T, typename K, typename V, WireFormatLite::FieldType key,
+          WireFormatLite::FieldType value, int default_enum>
+struct DeconstructMapEntry<MapEntryLite<T, K, V, key, value, default_enum> > {
+  typedef K Key;
+  typedef V Value;
+  static const WireFormatLite::FieldType kKeyFieldType = key;
+  static const WireFormatLite::FieldType kValueFieldType = value;
+  static const int default_enum_value = default_enum;
+};
+
+// Helpers for deterministic serialization =============================
+
+// This struct can be used with any generic sorting algorithm.  If the Key
+// type is relatively small and easy to copy then copying Keys into an
+// array of SortItems can be beneficial.  Then all the data the sorting
+// algorithm needs to touch is in that one array.
+template <typename Key, typename PtrToKeyValuePair>
+struct SortItem {
+  SortItem() {}
+  explicit SortItem(PtrToKeyValuePair p) : first(p->first), second(p) {}
+
+  Key first;
+  PtrToKeyValuePair second;
+};
+
+template <typename T>
+struct CompareByFirstField {
+  bool operator()(const T& a, const T& b) const { return a.first < b.first; }
+};
+
+template <typename T>
+struct CompareByDerefFirst {
+  bool operator()(const T& a, const T& b) const { return a->first < b->first; }
+};
+
+// Helper for table driven serialization
+
+template <WireFormatLite::FieldType FieldType>
+struct FromHelper {
+  template <typename T>
+  static const T& From(const T& x) {
+    return x;
+  }
+};
+
+template <>
+struct FromHelper<WireFormatLite::TYPE_STRING> {
+  static ArenaStringPtr From(const std::string& x) {
+    ArenaStringPtr res;
+    TaggedPtr<std::string> ptr;
+    ptr.Set(const_cast<std::string*>(&x));
+    res.UnsafeSetTaggedPointer(ptr);
+    return res;
+  }
+};
+template <>
+struct FromHelper<WireFormatLite::TYPE_BYTES> {
+  static ArenaStringPtr From(const std::string& x) {
+    ArenaStringPtr res;
+    TaggedPtr<std::string> ptr;
+    ptr.Set(const_cast<std::string*>(&x));
+    res.UnsafeSetTaggedPointer(ptr);
+    return res;
+  }
+};
+template <>
+struct FromHelper<WireFormatLite::TYPE_MESSAGE> {
+  template <typename T>
+  static T* From(const T& x) {
+    return const_cast<T*>(&x);
+  }
+};
+
+template <typename MapEntryType>
+struct MapEntryHelper;
+
+template <typename T, typename Key, typename Value,
+          WireFormatLite::FieldType kKeyFieldType,
+          WireFormatLite::FieldType kValueFieldType, int default_enum_value>
+struct MapEntryHelper<MapEntryLite<T, Key, Value, kKeyFieldType,
+                                   kValueFieldType, default_enum_value> > {
+  // Provide utilities to parse/serialize key/value.  Provide utilities to
+  // manipulate internal stored type.
+  typedef MapTypeHandler<kKeyFieldType, Key> KeyTypeHandler;
+  typedef MapTypeHandler<kValueFieldType, Value> ValueTypeHandler;
+
+  // Define internal memory layout. Strings and messages are stored as
+  // pointers, while other types are stored as values.
+  typedef typename KeyTypeHandler::TypeOnMemory KeyOnMemory;
+  typedef typename ValueTypeHandler::TypeOnMemory ValueOnMemory;
+
+  explicit MapEntryHelper(const MapPair<Key, Value>& map_pair)
+      : _has_bits_(3),
+        _cached_size_(2 + KeyTypeHandler::GetCachedSize(map_pair.first) +
+                      ValueTypeHandler::GetCachedSize(map_pair.second)),
+        key_(FromHelper<kKeyFieldType>::From(map_pair.first)),
+        value_(FromHelper<kValueFieldType>::From(map_pair.second)) {}
+
+  // Purposely not following the style guide naming. These are the names
+  // the proto compiler would generate given the map entry descriptor.
+  // The proto compiler generates the offsets in this struct as if this was
+  // a regular message. This way the table driven code barely notices it's
+  // dealing with a map field.
+  uint32 _has_bits_;     // NOLINT
+  uint32 _cached_size_;  // NOLINT
+  KeyOnMemory key_;      // NOLINT
+  ValueOnMemory value_;  // NOLINT
+};
+
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_MAP_ENTRY_LITE_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/metadata.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/metadata.h
new file mode 100644
index 0000000000000000000000000000000000000000..756936991766a4ef660ba971bc9b9172bd734b8d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/metadata.h
@@ -0,0 +1,41 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+#ifndef GOOGLE_PROTOBUF_METADATA_H__
+#define GOOGLE_PROTOBUF_METADATA_H__
+
+// TODO(b/151117630): Remove this file and all instances where it gets imported.
+
+#endif  // GOOGLE_PROTOBUF_METADATA_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/port.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/port.h
new file mode 100644
index 0000000000000000000000000000000000000000..cfb7d4fcdf21632bf9a342aa72d3d5c15c8c1615
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/port.h
@@ -0,0 +1,48 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// A common header that is included across all protobuf headers.  We do our best
+// to avoid #defining any macros here; instead we generally put macros in
+// port_def.inc and port_undef.inc so they are not visible from outside of
+// protobuf.
+
+#ifndef GOOGLE_PROTOBUF_PORT_H__
+#define GOOGLE_PROTOBUF_PORT_H__
+
+
+#include <google/protobuf/stubs/port.h>
+
+
+#endif  // GOOGLE_PROTOBUF_PORT_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection.h
new file mode 100644
index 0000000000000000000000000000000000000000..ff2da6f7f6b11af5f6fc0dce7be23fb67fc51345
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/reflection.h
@@ -0,0 +1,568 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// This header defines the RepeatedFieldRef class template used to access
+// repeated fields with protobuf reflection API.
+#ifndef GOOGLE_PROTOBUF_REFLECTION_H__
+#define GOOGLE_PROTOBUF_REFLECTION_H__
+
+#include <memory>
+
+#include <google/protobuf/message.h>
+#include <google/protobuf/generated_enum_util.h>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+#include <google/protobuf/port_def.inc>
+
+namespace google {
+namespace protobuf {
+namespace internal {
+template <typename T, typename Enable = void>
+struct RefTypeTraits;
+}  // namespace internal
+
+template <typename T>
+RepeatedFieldRef<T> Reflection::GetRepeatedFieldRef(
+    const Message& message, const FieldDescriptor* field) const {
+  return RepeatedFieldRef<T>(message, field);
+}
+
+template <typename T>
+MutableRepeatedFieldRef<T> Reflection::GetMutableRepeatedFieldRef(
+    Message* message, const FieldDescriptor* field) const {
+  return MutableRepeatedFieldRef<T>(message, field);
+}
+
+// RepeatedFieldRef definition for non-message types.
+template <typename T>
+class RepeatedFieldRef<
+    T, typename std::enable_if<!std::is_base_of<Message, T>::value>::type> {
+  typedef typename internal::RefTypeTraits<T>::iterator IteratorType;
+  typedef typename internal::RefTypeTraits<T>::AccessorType AccessorType;
+
+ public:
+  bool empty() const { return accessor_->IsEmpty(data_); }
+  int size() const { return accessor_->Size(data_); }
+  T Get(int index) const { return accessor_->template Get<T>(data_, index); }
+
+  typedef IteratorType iterator;
+  typedef IteratorType const_iterator;
+  typedef T value_type;
+  typedef T& reference;
+  typedef const T& const_reference;
+  typedef int size_type;
+  typedef ptrdiff_t difference_type;
+
+  iterator begin() const { return iterator(data_, accessor_, true); }
+  iterator end() const { return iterator(data_, accessor_, false); }
+
+ private:
+  friend class Reflection;
+  RepeatedFieldRef(const Message& message, const FieldDescriptor* field) {
+    const Reflection* reflection = message.GetReflection();
+    data_ = reflection->RepeatedFieldData(const_cast<Message*>(&message), field,
+                                          internal::RefTypeTraits<T>::cpp_type,
+                                          NULL);
+    accessor_ = reflection->RepeatedFieldAccessor(field);
+  }
+
+  const void* data_;
+  const AccessorType* accessor_;
+};
+
+// MutableRepeatedFieldRef definition for non-message types.
+template <typename T>
+class MutableRepeatedFieldRef<
+    T, typename std::enable_if<!std::is_base_of<Message, T>::value>::type> {
+  typedef typename internal::RefTypeTraits<T>::AccessorType AccessorType;
+
+ public:
+  bool empty() const { return accessor_->IsEmpty(data_); }
+  int size() const { return accessor_->Size(data_); }
+  T Get(int index) const { return accessor_->template Get<T>(data_, index); }
+
+  void Set(int index, const T& value) const {
+    accessor_->template Set<T>(data_, index, value);
+  }
+  void Add(const T& value) const { accessor_->template Add<T>(data_, value); }
+  void RemoveLast() const { accessor_->RemoveLast(data_); }
+  void SwapElements(int index1, int index2) const {
+    accessor_->SwapElements(data_, index1, index2);
+  }
+  void Clear() const { accessor_->Clear(data_); }
+
+  void Swap(const MutableRepeatedFieldRef& other) const {
+    accessor_->Swap(data_, other.accessor_, other.data_);
+  }
+
+  template <typename Container>
+  void MergeFrom(const Container& container) const {
+    typedef typename Container::const_iterator Iterator;
+    for (Iterator it = container.begin(); it != container.end(); ++it) {
+      Add(*it);
+    }
+  }
+  template <typename Container>
+  void CopyFrom(const Container& container) const {
+    Clear();
+    MergeFrom(container);
+  }
+
+ private:
+  friend class Reflection;
+  MutableRepeatedFieldRef(Message* message, const FieldDescriptor* field) {
+    const Reflection* reflection = message->GetReflection();
+    data_ = reflection->RepeatedFieldData(
+        message, field, internal::RefTypeTraits<T>::cpp_type, NULL);
+    accessor_ = reflection->RepeatedFieldAccessor(field);
+  }
+
+  void* data_;
+  const AccessorType* accessor_;
+};
+
+// RepeatedFieldRef definition for message types.
+template <typename T>
+class RepeatedFieldRef<
+    T, typename std::enable_if<std::is_base_of<Message, T>::value>::type> {
+  typedef typename internal::RefTypeTraits<T>::iterator IteratorType;
+  typedef typename internal::RefTypeTraits<T>::AccessorType AccessorType;
+
+ public:
+  bool empty() const { return accessor_->IsEmpty(data_); }
+  int size() const { return accessor_->Size(data_); }
+  // This method returns a reference to the underlying message object if it
+  // exists. If a message object doesn't exist (e.g., data stored in serialized
+  // form), scratch_space will be filled with the data and a reference to it
+  // will be returned.
+  //
+  // Example:
+  //   RepeatedFieldRef<Message> h = ...
+  //   unique_ptr<Message> scratch_space(h.NewMessage());
+  //   const Message& item = h.Get(index, scratch_space.get());
+  const T& Get(int index, T* scratch_space) const {
+    return *static_cast<const T*>(accessor_->Get(data_, index, scratch_space));
+  }
+  // Create a new message of the same type as the messages stored in this
+  // repeated field. Caller takes ownership of the returned object.
+  T* NewMessage() const { return static_cast<T*>(default_instance_->New()); }
+
+  typedef IteratorType iterator;
+  typedef IteratorType const_iterator;
+  typedef T value_type;
+  typedef T& reference;
+  typedef const T& const_reference;
+  typedef int size_type;
+  typedef ptrdiff_t difference_type;
+
+  iterator begin() const {
+    return iterator(data_, accessor_, true, NewMessage());
+  }
+  iterator end() const {
+    // The end iterator must not be dereferenced, no need for scratch space.
+    return iterator(data_, accessor_, false, nullptr);
+  }
+
+ private:
+  friend class Reflection;
+  RepeatedFieldRef(const Message& message, const FieldDescriptor* field) {
+    const Reflection* reflection = message.GetReflection();
+    data_ = reflection->RepeatedFieldData(
+        const_cast<Message*>(&message), field,
+        internal::RefTypeTraits<T>::cpp_type,
+        internal::RefTypeTraits<T>::GetMessageFieldDescriptor());
+    accessor_ = reflection->RepeatedFieldAccessor(field);
+    default_instance_ =
+        reflection->GetMessageFactory()->GetPrototype(field->message_type());
+  }
+
+  const void* data_;
+  const AccessorType* accessor_;
+  const Message* default_instance_;
+};
+
+// MutableRepeatedFieldRef definition for message types.
+template <typename T>
+class MutableRepeatedFieldRef<
+    T, typename std::enable_if<std::is_base_of<Message, T>::value>::type> {
+  typedef typename internal::RefTypeTraits<T>::AccessorType AccessorType;
+
+ public:
+  bool empty() const { return accessor_->IsEmpty(data_); }
+  int size() const { return accessor_->Size(data_); }
+  // See comments for RepeatedFieldRef<Message>::Get()
+  const T& Get(int index, T* scratch_space) const {
+    return *static_cast<const T*>(accessor_->Get(data_, index, scratch_space));
+  }
+  // Create a new message of the same type as the messages stored in this
+  // repeated field. Caller takes ownership of the returned object.
+  T* NewMessage() const { return static_cast<T*>(default_instance_->New()); }
+
+  void Set(int index, const T& value) const {
+    accessor_->Set(data_, index, &value);
+  }
+  void Add(const T& value) const { accessor_->Add(data_, &value); }
+  void RemoveLast() const { accessor_->RemoveLast(data_); }
+  void SwapElements(int index1, int index2) const {
+    accessor_->SwapElements(data_, index1, index2);
+  }
+  void Clear() const { accessor_->Clear(data_); }
+
+  void Swap(const MutableRepeatedFieldRef& other) const {
+    accessor_->Swap(data_, other.accessor_, other.data_);
+  }
+
+  template <typename Container>
+  void MergeFrom(const Container& container) const {
+    typedef typename Container::const_iterator Iterator;
+    for (Iterator it = container.begin(); it != container.end(); ++it) {
+      Add(*it);
+    }
+  }
+  template <typename Container>
+  void CopyFrom(const Container& container) const {
+    Clear();
+    MergeFrom(container);
+  }
+
+ private:
+  friend class Reflection;
+  MutableRepeatedFieldRef(Message* message, const FieldDescriptor* field) {
+    const Reflection* reflection = message->GetReflection();
+    data_ = reflection->RepeatedFieldData(
+        message, field, internal::RefTypeTraits<T>::cpp_type,
+        internal::RefTypeTraits<T>::GetMessageFieldDescriptor());
+    accessor_ = reflection->RepeatedFieldAccessor(field);
+    default_instance_ =
+        reflection->GetMessageFactory()->GetPrototype(field->message_type());
+  }
+
+  void* data_;
+  const AccessorType* accessor_;
+  const Message* default_instance_;
+};
+
+namespace internal {
+// Interfaces used to implement reflection RepeatedFieldRef API.
+// Reflection::GetRepeatedAccessor() should return a pointer to an singleton
+// object that implements the below interface.
+//
+// This interface passes/returns values using void pointers. The actual type
+// of the value depends on the field's cpp_type. Following is a mapping from
+// cpp_type to the type that should be used in this interface:
+//
+//   field->cpp_type()      T                Actual type of void*
+//   CPPTYPE_INT32        int32                   int32
+//   CPPTYPE_UINT32       uint32                  uint32
+//   CPPTYPE_INT64        int64                   int64
+//   CPPTYPE_UINT64       uint64                  uint64
+//   CPPTYPE_DOUBLE       double                  double
+//   CPPTYPE_FLOAT        float                   float
+//   CPPTYPE_BOOL         bool                    bool
+//   CPPTYPE_ENUM         generated enum type     int32
+//   CPPTYPE_STRING       string                  std::string
+//   CPPTYPE_MESSAGE      generated message type  google::protobuf::Message
+//                        or google::protobuf::Message
+//
+// Note that for enums we use int32 in the interface.
+//
+// You can map from T to the actual type using RefTypeTraits:
+//   typedef RefTypeTraits<T>::AccessorValueType ActualType;
+class PROTOBUF_EXPORT RepeatedFieldAccessor {
+ public:
+  // Typedefs for clarity.
+  typedef void Field;
+  typedef void Value;
+  typedef void Iterator;
+
+  virtual bool IsEmpty(const Field* data) const = 0;
+  virtual int Size(const Field* data) const = 0;
+  // Depends on the underlying representation of the repeated field, this
+  // method can return a pointer to the underlying object if such an object
+  // exists, or fill the data into scratch_space and return scratch_space.
+  // Callers of this method must ensure scratch_space is a valid pointer
+  // to a mutable object of the correct type.
+  virtual const Value* Get(const Field* data, int index,
+                           Value* scratch_space) const = 0;
+
+  virtual void Clear(Field* data) const = 0;
+  virtual void Set(Field* data, int index, const Value* value) const = 0;
+  virtual void Add(Field* data, const Value* value) const = 0;
+  virtual void RemoveLast(Field* data) const = 0;
+  virtual void SwapElements(Field* data, int index1, int index2) const = 0;
+  virtual void Swap(Field* data, const RepeatedFieldAccessor* other_mutator,
+                    Field* other_data) const = 0;
+
+  // Create an iterator that points at the beginning of the repeated field.
+  virtual Iterator* BeginIterator(const Field* data) const = 0;
+  // Create an iterator that points at the end of the repeated field.
+  virtual Iterator* EndIterator(const Field* data) const = 0;
+  // Make a copy of an iterator and return the new copy.
+  virtual Iterator* CopyIterator(const Field* data,
+                                 const Iterator* iterator) const = 0;
+  // Move an iterator to point to the next element.
+  virtual Iterator* AdvanceIterator(const Field* data,
+                                    Iterator* iterator) const = 0;
+  // Compare whether two iterators point to the same element.
+  virtual bool EqualsIterator(const Field* data, const Iterator* a,
+                              const Iterator* b) const = 0;
+  // Delete an iterator created by BeginIterator(), EndIterator() and
+  // CopyIterator().
+  virtual void DeleteIterator(const Field* data, Iterator* iterator) const = 0;
+  // Like Get() but for iterators.
+  virtual const Value* GetIteratorValue(const Field* data,
+                                        const Iterator* iterator,
+                                        Value* scratch_space) const = 0;
+
+  // Templated methods that make using this interface easier for non-message
+  // types.
+  template <typename T>
+  T Get(const Field* data, int index) const {
+    typedef typename RefTypeTraits<T>::AccessorValueType ActualType;
+    ActualType scratch_space;
+    return static_cast<T>(*reinterpret_cast<const ActualType*>(
+        Get(data, index, static_cast<Value*>(&scratch_space))));
+  }
+
+  template <typename T, typename ValueType>
+  void Set(Field* data, int index, const ValueType& value) const {
+    typedef typename RefTypeTraits<T>::AccessorValueType ActualType;
+    // In this RepeatedFieldAccessor interface we pass/return data using
+    // raw pointers. Type of the data these raw pointers point to should
+    // be ActualType. Here we have a ValueType object and want a ActualType
+    // pointer. We can't cast a ValueType pointer to an ActualType pointer
+    // directly because their type might be different (for enums ValueType
+    // may be a generated enum type while ActualType is int32). To be safe
+    // we make a copy to get a temporary ActualType object and use it.
+    ActualType tmp = static_cast<ActualType>(value);
+    Set(data, index, static_cast<const Value*>(&tmp));
+  }
+
+  template <typename T, typename ValueType>
+  void Add(Field* data, const ValueType& value) const {
+    typedef typename RefTypeTraits<T>::AccessorValueType ActualType;
+    // In this RepeatedFieldAccessor interface we pass/return data using
+    // raw pointers. Type of the data these raw pointers point to should
+    // be ActualType. Here we have a ValueType object and want a ActualType
+    // pointer. We can't cast a ValueType pointer to an ActualType pointer
+    // directly because their type might be different (for enums ValueType
+    // may be a generated enum type while ActualType is int32). To be safe
+    // we make a copy to get a temporary ActualType object and use it.
+    ActualType tmp = static_cast<ActualType>(value);
+    Add(data, static_cast<const Value*>(&tmp));
+  }
+
+ protected:
+  // We want the destructor to be completely trivial as to allow it to be
+  // a function local static. Hence we make it non-virtual and protected,
+  // this class only live as part of a global singleton and should not be
+  // deleted.
+  ~RepeatedFieldAccessor() = default;
+};
+
+// Implement (Mutable)RepeatedFieldRef::iterator
+template <typename T>
+class RepeatedFieldRefIterator
+    : public std::iterator<std::forward_iterator_tag, T> {
+  typedef typename RefTypeTraits<T>::AccessorValueType AccessorValueType;
+  typedef typename RefTypeTraits<T>::IteratorValueType IteratorValueType;
+  typedef typename RefTypeTraits<T>::IteratorPointerType IteratorPointerType;
+
+ public:
+  // Constructor for non-message fields.
+  RepeatedFieldRefIterator(const void* data,
+                           const RepeatedFieldAccessor* accessor, bool begin)
+      : data_(data),
+        accessor_(accessor),
+        iterator_(begin ? accessor->BeginIterator(data)
+                        : accessor->EndIterator(data)),
+        // The end iterator must not be dereferenced, no need for scratch space.
+        scratch_space_(begin ? new AccessorValueType : nullptr) {}
+  // Constructor for message fields.
+  RepeatedFieldRefIterator(const void* data,
+                           const RepeatedFieldAccessor* accessor, bool begin,
+                           AccessorValueType* scratch_space)
+      : data_(data),
+        accessor_(accessor),
+        iterator_(begin ? accessor->BeginIterator(data)
+                        : accessor->EndIterator(data)),
+        scratch_space_(scratch_space) {}
+  ~RepeatedFieldRefIterator() { accessor_->DeleteIterator(data_, iterator_); }
+  RepeatedFieldRefIterator operator++(int) {
+    RepeatedFieldRefIterator tmp(*this);
+    iterator_ = accessor_->AdvanceIterator(data_, iterator_);
+    return tmp;
+  }
+  RepeatedFieldRefIterator& operator++() {
+    iterator_ = accessor_->AdvanceIterator(data_, iterator_);
+    return *this;
+  }
+  IteratorValueType operator*() const {
+    return static_cast<IteratorValueType>(
+        *static_cast<const AccessorValueType*>(accessor_->GetIteratorValue(
+            data_, iterator_, scratch_space_.get())));
+  }
+  IteratorPointerType operator->() const {
+    return static_cast<IteratorPointerType>(
+        accessor_->GetIteratorValue(data_, iterator_, scratch_space_.get()));
+  }
+  bool operator!=(const RepeatedFieldRefIterator& other) const {
+    assert(data_ == other.data_);
+    assert(accessor_ == other.accessor_);
+    return !accessor_->EqualsIterator(data_, iterator_, other.iterator_);
+  }
+  bool operator==(const RepeatedFieldRefIterator& other) const {
+    return !this->operator!=(other);
+  }
+
+  RepeatedFieldRefIterator(const RepeatedFieldRefIterator& other)
+      : data_(other.data_),
+        accessor_(other.accessor_),
+        iterator_(accessor_->CopyIterator(data_, other.iterator_)) {}
+  RepeatedFieldRefIterator& operator=(const RepeatedFieldRefIterator& other) {
+    if (this != &other) {
+      accessor_->DeleteIterator(data_, iterator_);
+      data_ = other.data_;
+      accessor_ = other.accessor_;
+      iterator_ = accessor_->CopyIterator(data_, other.iterator_);
+    }
+    return *this;
+  }
+
+ protected:
+  const void* data_;
+  const RepeatedFieldAccessor* accessor_;
+  void* iterator_;
+  std::unique_ptr<AccessorValueType> scratch_space_;
+};
+
+// TypeTraits that maps the type parameter T of RepeatedFieldRef or
+// MutableRepeatedFieldRef to corresponding iterator type,
+// RepeatedFieldAccessor type, etc.
+template <typename T>
+struct PrimitiveTraits {
+  static constexpr bool is_primitive = false;
+};
+#define DEFINE_PRIMITIVE(TYPE, type)                 \
+  template <>                                        \
+  struct PrimitiveTraits<type> {                     \
+    static const bool is_primitive = true;           \
+    static const FieldDescriptor::CppType cpp_type = \
+        FieldDescriptor::CPPTYPE_##TYPE;             \
+  };
+DEFINE_PRIMITIVE(INT32, int32)
+DEFINE_PRIMITIVE(UINT32, uint32)
+DEFINE_PRIMITIVE(INT64, int64)
+DEFINE_PRIMITIVE(UINT64, uint64)
+DEFINE_PRIMITIVE(FLOAT, float)
+DEFINE_PRIMITIVE(DOUBLE, double)
+DEFINE_PRIMITIVE(BOOL, bool)
+#undef DEFINE_PRIMITIVE
+
+template <typename T>
+struct RefTypeTraits<
+    T, typename std::enable_if<PrimitiveTraits<T>::is_primitive>::type> {
+  typedef RepeatedFieldRefIterator<T> iterator;
+  typedef RepeatedFieldAccessor AccessorType;
+  typedef T AccessorValueType;
+  typedef T IteratorValueType;
+  typedef T* IteratorPointerType;
+  static constexpr FieldDescriptor::CppType cpp_type =
+      PrimitiveTraits<T>::cpp_type;
+  static const Descriptor* GetMessageFieldDescriptor() { return NULL; }
+};
+
+template <typename T>
+struct RefTypeTraits<
+    T, typename std::enable_if<is_proto_enum<T>::value>::type> {
+  typedef RepeatedFieldRefIterator<T> iterator;
+  typedef RepeatedFieldAccessor AccessorType;
+  // We use int32 for repeated enums in RepeatedFieldAccessor.
+  typedef int32 AccessorValueType;
+  typedef T IteratorValueType;
+  typedef int32* IteratorPointerType;
+  static constexpr FieldDescriptor::CppType cpp_type =
+      FieldDescriptor::CPPTYPE_ENUM;
+  static const Descriptor* GetMessageFieldDescriptor() { return NULL; }
+};
+
+template <typename T>
+struct RefTypeTraits<
+    T, typename std::enable_if<std::is_same<std::string, T>::value>::type> {
+  typedef RepeatedFieldRefIterator<T> iterator;
+  typedef RepeatedFieldAccessor AccessorType;
+  typedef std::string AccessorValueType;
+  typedef const std::string IteratorValueType;
+  typedef const std::string* IteratorPointerType;
+  static constexpr FieldDescriptor::CppType cpp_type =
+      FieldDescriptor::CPPTYPE_STRING;
+  static const Descriptor* GetMessageFieldDescriptor() { return NULL; }
+};
+
+template <typename T>
+struct MessageDescriptorGetter {
+  static const Descriptor* get() {
+    return T::default_instance().GetDescriptor();
+  }
+};
+template <>
+struct MessageDescriptorGetter<Message> {
+  static const Descriptor* get() { return NULL; }
+};
+
+template <typename T>
+struct RefTypeTraits<
+    T, typename std::enable_if<std::is_base_of<Message, T>::value>::type> {
+  typedef RepeatedFieldRefIterator<T> iterator;
+  typedef RepeatedFieldAccessor AccessorType;
+  typedef Message AccessorValueType;
+  typedef const T& IteratorValueType;
+  typedef const T* IteratorPointerType;
+  static constexpr FieldDescriptor::CppType cpp_type =
+      FieldDescriptor::CPPTYPE_MESSAGE;
+  static const Descriptor* GetMessageFieldDescriptor() {
+    return MessageDescriptorGetter<T>::get();
+  }
+};
+}  // namespace internal
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+
+#endif  // GOOGLE_PROTOBUF_REFLECTION_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/struct.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/struct.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..349b293ade6e3011ec94538b2acfb0099be7a369
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/struct.pb.h
@@ -0,0 +1,1178 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/struct.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fstruct_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fstruct_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/map.h>  // IWYU pragma: export
+#include <google/protobuf/map_entry.h>
+#include <google/protobuf/map_field_inl.h>
+#include <google/protobuf/generated_enum_reflection.h>
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2fstruct_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2fstruct_2eproto {
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[4]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2fstruct_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class ListValue;
+class ListValueDefaultTypeInternal;
+PROTOBUF_EXPORT extern ListValueDefaultTypeInternal _ListValue_default_instance_;
+class Struct;
+class StructDefaultTypeInternal;
+PROTOBUF_EXPORT extern StructDefaultTypeInternal _Struct_default_instance_;
+class Struct_FieldsEntry_DoNotUse;
+class Struct_FieldsEntry_DoNotUseDefaultTypeInternal;
+PROTOBUF_EXPORT extern Struct_FieldsEntry_DoNotUseDefaultTypeInternal _Struct_FieldsEntry_DoNotUse_default_instance_;
+class Value;
+class ValueDefaultTypeInternal;
+PROTOBUF_EXPORT extern ValueDefaultTypeInternal _Value_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::ListValue* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::ListValue>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Struct* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Struct>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Struct_FieldsEntry_DoNotUse* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Struct_FieldsEntry_DoNotUse>(Arena*);
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Value* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Value>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+enum NullValue : int {
+  NULL_VALUE = 0,
+  NullValue_INT_MIN_SENTINEL_DO_NOT_USE_ = std::numeric_limits<::PROTOBUF_NAMESPACE_ID::int32>::min(),
+  NullValue_INT_MAX_SENTINEL_DO_NOT_USE_ = std::numeric_limits<::PROTOBUF_NAMESPACE_ID::int32>::max()
+};
+PROTOBUF_EXPORT bool NullValue_IsValid(int value);
+constexpr NullValue NullValue_MIN = NULL_VALUE;
+constexpr NullValue NullValue_MAX = NULL_VALUE;
+constexpr int NullValue_ARRAYSIZE = NullValue_MAX + 1;
+
+PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::EnumDescriptor* NullValue_descriptor();
+template<typename T>
+inline const std::string& NullValue_Name(T enum_t_value) {
+  static_assert(::std::is_same<T, NullValue>::value ||
+    ::std::is_integral<T>::value,
+    "Incorrect type passed to function NullValue_Name.");
+  return ::PROTOBUF_NAMESPACE_ID::internal::NameOfEnum(
+    NullValue_descriptor(), enum_t_value);
+}
+inline bool NullValue_Parse(
+    ::PROTOBUF_NAMESPACE_ID::ConstStringParam name, NullValue* value) {
+  return ::PROTOBUF_NAMESPACE_ID::internal::ParseNamedEnum<NullValue>(
+    NullValue_descriptor(), name, value);
+}
+// ===================================================================
+
+class Struct_FieldsEntry_DoNotUse : public ::PROTOBUF_NAMESPACE_ID::internal::MapEntry<Struct_FieldsEntry_DoNotUse, 
+    std::string, PROTOBUF_NAMESPACE_ID::Value,
+    ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_STRING,
+    ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_MESSAGE,
+    0 > {
+public:
+  typedef ::PROTOBUF_NAMESPACE_ID::internal::MapEntry<Struct_FieldsEntry_DoNotUse, 
+    std::string, PROTOBUF_NAMESPACE_ID::Value,
+    ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_STRING,
+    ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_MESSAGE,
+    0 > SuperType;
+  Struct_FieldsEntry_DoNotUse();
+  explicit Struct_FieldsEntry_DoNotUse(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  void MergeFrom(const Struct_FieldsEntry_DoNotUse& other);
+  static const Struct_FieldsEntry_DoNotUse* internal_default_instance() { return reinterpret_cast<const Struct_FieldsEntry_DoNotUse*>(&_Struct_FieldsEntry_DoNotUse_default_instance_); }
+  static bool ValidateKey(std::string* s) {
+    return ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::VerifyUtf8String(s->data(), static_cast<int>(s->size()), ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::PARSE, "google.protobuf.Struct.FieldsEntry.key");
+ }
+  static bool ValidateValue(void*) { return true; }
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& other) final;
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[0];
+  }
+
+  public:
+};
+
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT Struct PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Struct) */ {
+ public:
+  inline Struct() : Struct(nullptr) {}
+  virtual ~Struct();
+
+  Struct(const Struct& from);
+  Struct(Struct&& from) noexcept
+    : Struct() {
+    *this = ::std::move(from);
+  }
+
+  inline Struct& operator=(const Struct& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Struct& operator=(Struct&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Struct& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Struct* internal_default_instance() {
+    return reinterpret_cast<const Struct*>(
+               &_Struct_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    1;
+
+  friend void swap(Struct& a, Struct& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Struct* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Struct* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline Struct* New() const final {
+    return CreateMaybeMessage<Struct>(nullptr);
+  }
+
+  Struct* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Struct>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Struct& from);
+  void MergeFrom(const Struct& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Struct* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Struct";
+  }
+  protected:
+  explicit Struct(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kFieldsFieldNumber = 1,
+  };
+  // map<string, .google.protobuf.Value> fields = 1;
+  int fields_size() const;
+  private:
+  int _internal_fields_size() const;
+  public:
+  void clear_fields();
+  private:
+  const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >&
+      _internal_fields() const;
+  ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >*
+      _internal_mutable_fields();
+  public:
+  const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >&
+      fields() const;
+  ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >*
+      mutable_fields();
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Struct)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::internal::MapField<
+      Struct_FieldsEntry_DoNotUse,
+      std::string, PROTOBUF_NAMESPACE_ID::Value,
+      ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_STRING,
+      ::PROTOBUF_NAMESPACE_ID::internal::WireFormatLite::TYPE_MESSAGE,
+      0 > fields_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2fstruct_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT Value PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Value) */ {
+ public:
+  inline Value() : Value(nullptr) {}
+  virtual ~Value();
+
+  Value(const Value& from);
+  Value(Value&& from) noexcept
+    : Value() {
+    *this = ::std::move(from);
+  }
+
+  inline Value& operator=(const Value& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Value& operator=(Value&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Value& default_instance();
+
+  enum KindCase {
+    kNullValue = 1,
+    kNumberValue = 2,
+    kStringValue = 3,
+    kBoolValue = 4,
+    kStructValue = 5,
+    kListValue = 6,
+    KIND_NOT_SET = 0,
+  };
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Value* internal_default_instance() {
+    return reinterpret_cast<const Value*>(
+               &_Value_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    2;
+
+  friend void swap(Value& a, Value& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Value* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Value* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline Value* New() const final {
+    return CreateMaybeMessage<Value>(nullptr);
+  }
+
+  Value* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Value>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Value& from);
+  void MergeFrom(const Value& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Value* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Value";
+  }
+  protected:
+  explicit Value(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kNullValueFieldNumber = 1,
+    kNumberValueFieldNumber = 2,
+    kStringValueFieldNumber = 3,
+    kBoolValueFieldNumber = 4,
+    kStructValueFieldNumber = 5,
+    kListValueFieldNumber = 6,
+  };
+  // .google.protobuf.NullValue null_value = 1;
+  private:
+  bool _internal_has_null_value() const;
+  public:
+  void clear_null_value();
+  PROTOBUF_NAMESPACE_ID::NullValue null_value() const;
+  void set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value);
+  private:
+  PROTOBUF_NAMESPACE_ID::NullValue _internal_null_value() const;
+  void _internal_set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value);
+  public:
+
+  // double number_value = 2;
+  private:
+  bool _internal_has_number_value() const;
+  public:
+  void clear_number_value();
+  double number_value() const;
+  void set_number_value(double value);
+  private:
+  double _internal_number_value() const;
+  void _internal_set_number_value(double value);
+  public:
+
+  // string string_value = 3;
+  private:
+  bool _internal_has_string_value() const;
+  public:
+  void clear_string_value();
+  const std::string& string_value() const;
+  void set_string_value(const std::string& value);
+  void set_string_value(std::string&& value);
+  void set_string_value(const char* value);
+  void set_string_value(const char* value, size_t size);
+  std::string* mutable_string_value();
+  std::string* release_string_value();
+  void set_allocated_string_value(std::string* string_value);
+  private:
+  const std::string& _internal_string_value() const;
+  void _internal_set_string_value(const std::string& value);
+  std::string* _internal_mutable_string_value();
+  public:
+
+  // bool bool_value = 4;
+  private:
+  bool _internal_has_bool_value() const;
+  public:
+  void clear_bool_value();
+  bool bool_value() const;
+  void set_bool_value(bool value);
+  private:
+  bool _internal_bool_value() const;
+  void _internal_set_bool_value(bool value);
+  public:
+
+  // .google.protobuf.Struct struct_value = 5;
+  bool has_struct_value() const;
+  private:
+  bool _internal_has_struct_value() const;
+  public:
+  void clear_struct_value();
+  const PROTOBUF_NAMESPACE_ID::Struct& struct_value() const;
+  PROTOBUF_NAMESPACE_ID::Struct* release_struct_value();
+  PROTOBUF_NAMESPACE_ID::Struct* mutable_struct_value();
+  void set_allocated_struct_value(PROTOBUF_NAMESPACE_ID::Struct* struct_value);
+  private:
+  const PROTOBUF_NAMESPACE_ID::Struct& _internal_struct_value() const;
+  PROTOBUF_NAMESPACE_ID::Struct* _internal_mutable_struct_value();
+  public:
+  void unsafe_arena_set_allocated_struct_value(
+      PROTOBUF_NAMESPACE_ID::Struct* struct_value);
+  PROTOBUF_NAMESPACE_ID::Struct* unsafe_arena_release_struct_value();
+
+  // .google.protobuf.ListValue list_value = 6;
+  bool has_list_value() const;
+  private:
+  bool _internal_has_list_value() const;
+  public:
+  void clear_list_value();
+  const PROTOBUF_NAMESPACE_ID::ListValue& list_value() const;
+  PROTOBUF_NAMESPACE_ID::ListValue* release_list_value();
+  PROTOBUF_NAMESPACE_ID::ListValue* mutable_list_value();
+  void set_allocated_list_value(PROTOBUF_NAMESPACE_ID::ListValue* list_value);
+  private:
+  const PROTOBUF_NAMESPACE_ID::ListValue& _internal_list_value() const;
+  PROTOBUF_NAMESPACE_ID::ListValue* _internal_mutable_list_value();
+  public:
+  void unsafe_arena_set_allocated_list_value(
+      PROTOBUF_NAMESPACE_ID::ListValue* list_value);
+  PROTOBUF_NAMESPACE_ID::ListValue* unsafe_arena_release_list_value();
+
+  void clear_kind();
+  KindCase kind_case() const;
+  // @@protoc_insertion_point(class_scope:google.protobuf.Value)
+ private:
+  class _Internal;
+  void set_has_null_value();
+  void set_has_number_value();
+  void set_has_string_value();
+  void set_has_bool_value();
+  void set_has_struct_value();
+  void set_has_list_value();
+
+  inline bool has_kind() const;
+  inline void clear_has_kind();
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  union KindUnion {
+    KindUnion() {}
+    int null_value_;
+    double number_value_;
+    ::PROTOBUF_NAMESPACE_ID::internal::ArenaStringPtr string_value_;
+    bool bool_value_;
+    PROTOBUF_NAMESPACE_ID::Struct* struct_value_;
+    PROTOBUF_NAMESPACE_ID::ListValue* list_value_;
+  } kind_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  ::PROTOBUF_NAMESPACE_ID::uint32 _oneof_case_[1];
+
+  friend struct ::TableStruct_google_2fprotobuf_2fstruct_2eproto;
+};
+// -------------------------------------------------------------------
+
+class PROTOBUF_EXPORT ListValue PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.ListValue) */ {
+ public:
+  inline ListValue() : ListValue(nullptr) {}
+  virtual ~ListValue();
+
+  ListValue(const ListValue& from);
+  ListValue(ListValue&& from) noexcept
+    : ListValue() {
+    *this = ::std::move(from);
+  }
+
+  inline ListValue& operator=(const ListValue& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline ListValue& operator=(ListValue&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const ListValue& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const ListValue* internal_default_instance() {
+    return reinterpret_cast<const ListValue*>(
+               &_ListValue_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    3;
+
+  friend void swap(ListValue& a, ListValue& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(ListValue* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(ListValue* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline ListValue* New() const final {
+    return CreateMaybeMessage<ListValue>(nullptr);
+  }
+
+  ListValue* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<ListValue>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const ListValue& from);
+  void MergeFrom(const ListValue& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(ListValue* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.ListValue";
+  }
+  protected:
+  explicit ListValue(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2fstruct_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2fstruct_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kValuesFieldNumber = 1,
+  };
+  // repeated .google.protobuf.Value values = 1;
+  int values_size() const;
+  private:
+  int _internal_values_size() const;
+  public:
+  void clear_values();
+  PROTOBUF_NAMESPACE_ID::Value* mutable_values(int index);
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >*
+      mutable_values();
+  private:
+  const PROTOBUF_NAMESPACE_ID::Value& _internal_values(int index) const;
+  PROTOBUF_NAMESPACE_ID::Value* _internal_add_values();
+  public:
+  const PROTOBUF_NAMESPACE_ID::Value& values(int index) const;
+  PROTOBUF_NAMESPACE_ID::Value* add_values();
+  const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >&
+      values() const;
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.ListValue)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value > values_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2fstruct_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// -------------------------------------------------------------------
+
+// Struct
+
+// map<string, .google.protobuf.Value> fields = 1;
+inline int Struct::_internal_fields_size() const {
+  return fields_.size();
+}
+inline int Struct::fields_size() const {
+  return _internal_fields_size();
+}
+inline void Struct::clear_fields() {
+  fields_.Clear();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >&
+Struct::_internal_fields() const {
+  return fields_.GetMap();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >&
+Struct::fields() const {
+  // @@protoc_insertion_point(field_map:google.protobuf.Struct.fields)
+  return _internal_fields();
+}
+inline ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >*
+Struct::_internal_mutable_fields() {
+  return fields_.MutableMap();
+}
+inline ::PROTOBUF_NAMESPACE_ID::Map< std::string, PROTOBUF_NAMESPACE_ID::Value >*
+Struct::mutable_fields() {
+  // @@protoc_insertion_point(field_mutable_map:google.protobuf.Struct.fields)
+  return _internal_mutable_fields();
+}
+
+// -------------------------------------------------------------------
+
+// Value
+
+// .google.protobuf.NullValue null_value = 1;
+inline bool Value::_internal_has_null_value() const {
+  return kind_case() == kNullValue;
+}
+inline void Value::set_has_null_value() {
+  _oneof_case_[0] = kNullValue;
+}
+inline void Value::clear_null_value() {
+  if (_internal_has_null_value()) {
+    kind_.null_value_ = 0;
+    clear_has_kind();
+  }
+}
+inline PROTOBUF_NAMESPACE_ID::NullValue Value::_internal_null_value() const {
+  if (_internal_has_null_value()) {
+    return static_cast< PROTOBUF_NAMESPACE_ID::NullValue >(kind_.null_value_);
+  }
+  return static_cast< PROTOBUF_NAMESPACE_ID::NullValue >(0);
+}
+inline PROTOBUF_NAMESPACE_ID::NullValue Value::null_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Value.null_value)
+  return _internal_null_value();
+}
+inline void Value::_internal_set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value) {
+  if (!_internal_has_null_value()) {
+    clear_kind();
+    set_has_null_value();
+  }
+  kind_.null_value_ = value;
+}
+inline void Value::set_null_value(PROTOBUF_NAMESPACE_ID::NullValue value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.Value.null_value)
+  _internal_set_null_value(value);
+}
+
+// double number_value = 2;
+inline bool Value::_internal_has_number_value() const {
+  return kind_case() == kNumberValue;
+}
+inline void Value::set_has_number_value() {
+  _oneof_case_[0] = kNumberValue;
+}
+inline void Value::clear_number_value() {
+  if (_internal_has_number_value()) {
+    kind_.number_value_ = 0;
+    clear_has_kind();
+  }
+}
+inline double Value::_internal_number_value() const {
+  if (_internal_has_number_value()) {
+    return kind_.number_value_;
+  }
+  return 0;
+}
+inline void Value::_internal_set_number_value(double value) {
+  if (!_internal_has_number_value()) {
+    clear_kind();
+    set_has_number_value();
+  }
+  kind_.number_value_ = value;
+}
+inline double Value::number_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Value.number_value)
+  return _internal_number_value();
+}
+inline void Value::set_number_value(double value) {
+  _internal_set_number_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Value.number_value)
+}
+
+// string string_value = 3;
+inline bool Value::_internal_has_string_value() const {
+  return kind_case() == kStringValue;
+}
+inline void Value::set_has_string_value() {
+  _oneof_case_[0] = kStringValue;
+}
+inline void Value::clear_string_value() {
+  if (_internal_has_string_value()) {
+    kind_.string_value_.Destroy(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+    clear_has_kind();
+  }
+}
+inline const std::string& Value::string_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Value.string_value)
+  return _internal_string_value();
+}
+inline void Value::set_string_value(const std::string& value) {
+  _internal_set_string_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Value.string_value)
+}
+inline std::string* Value::mutable_string_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Value.string_value)
+  return _internal_mutable_string_value();
+}
+inline const std::string& Value::_internal_string_value() const {
+  if (_internal_has_string_value()) {
+    return kind_.string_value_.Get();
+  }
+  return *&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited();
+}
+inline void Value::_internal_set_string_value(const std::string& value) {
+  if (!_internal_has_string_value()) {
+    clear_kind();
+    set_has_string_value();
+    kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
+  }
+  kind_.string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), value, GetArena());
+}
+inline void Value::set_string_value(std::string&& value) {
+  // @@protoc_insertion_point(field_set:google.protobuf.Value.string_value)
+  if (!_internal_has_string_value()) {
+    clear_kind();
+    set_has_string_value();
+    kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
+  }
+  kind_.string_value_.Set(
+    &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::move(value), GetArena());
+  // @@protoc_insertion_point(field_set_rvalue:google.protobuf.Value.string_value)
+}
+inline void Value::set_string_value(const char* value) {
+  GOOGLE_DCHECK(value != nullptr);
+  if (!_internal_has_string_value()) {
+    clear_kind();
+    set_has_string_value();
+    kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
+  }
+  kind_.string_value_.Set(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(),
+      ::std::string(value), GetArena());
+  // @@protoc_insertion_point(field_set_char:google.protobuf.Value.string_value)
+}
+inline void Value::set_string_value(const char* value,
+                             size_t size) {
+  if (!_internal_has_string_value()) {
+    clear_kind();
+    set_has_string_value();
+    kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
+  }
+  kind_.string_value_.Set(
+      &::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), ::std::string(
+      reinterpret_cast<const char*>(value), size),
+      GetArena());
+  // @@protoc_insertion_point(field_set_pointer:google.protobuf.Value.string_value)
+}
+inline std::string* Value::_internal_mutable_string_value() {
+  if (!_internal_has_string_value()) {
+    clear_kind();
+    set_has_string_value();
+    kind_.string_value_.UnsafeSetDefault(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited());
+  }
+  return kind_.string_value_.Mutable(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+}
+inline std::string* Value::release_string_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.Value.string_value)
+  if (_internal_has_string_value()) {
+    clear_has_kind();
+    return kind_.string_value_.Release(&::PROTOBUF_NAMESPACE_ID::internal::GetEmptyStringAlreadyInited(), GetArena());
+  } else {
+    return nullptr;
+  }
+}
+inline void Value::set_allocated_string_value(std::string* string_value) {
+  if (has_kind()) {
+    clear_kind();
+  }
+  if (string_value != nullptr) {
+    set_has_string_value();
+    kind_.string_value_.UnsafeSetDefault(string_value);
+    ::PROTOBUF_NAMESPACE_ID::Arena* arena = GetArena();
+    if (arena != nullptr) {
+      arena->Own(string_value);
+    }
+  }
+  // @@protoc_insertion_point(field_set_allocated:google.protobuf.Value.string_value)
+}
+
+// bool bool_value = 4;
+inline bool Value::_internal_has_bool_value() const {
+  return kind_case() == kBoolValue;
+}
+inline void Value::set_has_bool_value() {
+  _oneof_case_[0] = kBoolValue;
+}
+inline void Value::clear_bool_value() {
+  if (_internal_has_bool_value()) {
+    kind_.bool_value_ = false;
+    clear_has_kind();
+  }
+}
+inline bool Value::_internal_bool_value() const {
+  if (_internal_has_bool_value()) {
+    return kind_.bool_value_;
+  }
+  return false;
+}
+inline void Value::_internal_set_bool_value(bool value) {
+  if (!_internal_has_bool_value()) {
+    clear_kind();
+    set_has_bool_value();
+  }
+  kind_.bool_value_ = value;
+}
+inline bool Value::bool_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Value.bool_value)
+  return _internal_bool_value();
+}
+inline void Value::set_bool_value(bool value) {
+  _internal_set_bool_value(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Value.bool_value)
+}
+
+// .google.protobuf.Struct struct_value = 5;
+inline bool Value::_internal_has_struct_value() const {
+  return kind_case() == kStructValue;
+}
+inline bool Value::has_struct_value() const {
+  return _internal_has_struct_value();
+}
+inline void Value::set_has_struct_value() {
+  _oneof_case_[0] = kStructValue;
+}
+inline void Value::clear_struct_value() {
+  if (_internal_has_struct_value()) {
+    if (GetArena() == nullptr) {
+      delete kind_.struct_value_;
+    }
+    clear_has_kind();
+  }
+}
+inline PROTOBUF_NAMESPACE_ID::Struct* Value::release_struct_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.Value.struct_value)
+  if (_internal_has_struct_value()) {
+    clear_has_kind();
+      PROTOBUF_NAMESPACE_ID::Struct* temp = kind_.struct_value_;
+    if (GetArena() != nullptr) {
+      temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+    }
+    kind_.struct_value_ = nullptr;
+    return temp;
+  } else {
+    return nullptr;
+  }
+}
+inline const PROTOBUF_NAMESPACE_ID::Struct& Value::_internal_struct_value() const {
+  return _internal_has_struct_value()
+      ? *kind_.struct_value_
+      : *reinterpret_cast< PROTOBUF_NAMESPACE_ID::Struct*>(&PROTOBUF_NAMESPACE_ID::_Struct_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::Struct& Value::struct_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Value.struct_value)
+  return _internal_struct_value();
+}
+inline PROTOBUF_NAMESPACE_ID::Struct* Value::unsafe_arena_release_struct_value() {
+  // @@protoc_insertion_point(field_unsafe_arena_release:google.protobuf.Value.struct_value)
+  if (_internal_has_struct_value()) {
+    clear_has_kind();
+    PROTOBUF_NAMESPACE_ID::Struct* temp = kind_.struct_value_;
+    kind_.struct_value_ = nullptr;
+    return temp;
+  } else {
+    return nullptr;
+  }
+}
+inline void Value::unsafe_arena_set_allocated_struct_value(PROTOBUF_NAMESPACE_ID::Struct* struct_value) {
+  clear_kind();
+  if (struct_value) {
+    set_has_struct_value();
+    kind_.struct_value_ = struct_value;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.Value.struct_value)
+}
+inline PROTOBUF_NAMESPACE_ID::Struct* Value::_internal_mutable_struct_value() {
+  if (!_internal_has_struct_value()) {
+    clear_kind();
+    set_has_struct_value();
+    kind_.struct_value_ = CreateMaybeMessage< PROTOBUF_NAMESPACE_ID::Struct >(GetArena());
+  }
+  return kind_.struct_value_;
+}
+inline PROTOBUF_NAMESPACE_ID::Struct* Value::mutable_struct_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Value.struct_value)
+  return _internal_mutable_struct_value();
+}
+
+// .google.protobuf.ListValue list_value = 6;
+inline bool Value::_internal_has_list_value() const {
+  return kind_case() == kListValue;
+}
+inline bool Value::has_list_value() const {
+  return _internal_has_list_value();
+}
+inline void Value::set_has_list_value() {
+  _oneof_case_[0] = kListValue;
+}
+inline void Value::clear_list_value() {
+  if (_internal_has_list_value()) {
+    if (GetArena() == nullptr) {
+      delete kind_.list_value_;
+    }
+    clear_has_kind();
+  }
+}
+inline PROTOBUF_NAMESPACE_ID::ListValue* Value::release_list_value() {
+  // @@protoc_insertion_point(field_release:google.protobuf.Value.list_value)
+  if (_internal_has_list_value()) {
+    clear_has_kind();
+      PROTOBUF_NAMESPACE_ID::ListValue* temp = kind_.list_value_;
+    if (GetArena() != nullptr) {
+      temp = ::PROTOBUF_NAMESPACE_ID::internal::DuplicateIfNonNull(temp);
+    }
+    kind_.list_value_ = nullptr;
+    return temp;
+  } else {
+    return nullptr;
+  }
+}
+inline const PROTOBUF_NAMESPACE_ID::ListValue& Value::_internal_list_value() const {
+  return _internal_has_list_value()
+      ? *kind_.list_value_
+      : *reinterpret_cast< PROTOBUF_NAMESPACE_ID::ListValue*>(&PROTOBUF_NAMESPACE_ID::_ListValue_default_instance_);
+}
+inline const PROTOBUF_NAMESPACE_ID::ListValue& Value::list_value() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Value.list_value)
+  return _internal_list_value();
+}
+inline PROTOBUF_NAMESPACE_ID::ListValue* Value::unsafe_arena_release_list_value() {
+  // @@protoc_insertion_point(field_unsafe_arena_release:google.protobuf.Value.list_value)
+  if (_internal_has_list_value()) {
+    clear_has_kind();
+    PROTOBUF_NAMESPACE_ID::ListValue* temp = kind_.list_value_;
+    kind_.list_value_ = nullptr;
+    return temp;
+  } else {
+    return nullptr;
+  }
+}
+inline void Value::unsafe_arena_set_allocated_list_value(PROTOBUF_NAMESPACE_ID::ListValue* list_value) {
+  clear_kind();
+  if (list_value) {
+    set_has_list_value();
+    kind_.list_value_ = list_value;
+  }
+  // @@protoc_insertion_point(field_unsafe_arena_set_allocated:google.protobuf.Value.list_value)
+}
+inline PROTOBUF_NAMESPACE_ID::ListValue* Value::_internal_mutable_list_value() {
+  if (!_internal_has_list_value()) {
+    clear_kind();
+    set_has_list_value();
+    kind_.list_value_ = CreateMaybeMessage< PROTOBUF_NAMESPACE_ID::ListValue >(GetArena());
+  }
+  return kind_.list_value_;
+}
+inline PROTOBUF_NAMESPACE_ID::ListValue* Value::mutable_list_value() {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.Value.list_value)
+  return _internal_mutable_list_value();
+}
+
+inline bool Value::has_kind() const {
+  return kind_case() != KIND_NOT_SET;
+}
+inline void Value::clear_has_kind() {
+  _oneof_case_[0] = KIND_NOT_SET;
+}
+inline Value::KindCase Value::kind_case() const {
+  return Value::KindCase(_oneof_case_[0]);
+}
+// -------------------------------------------------------------------
+
+// ListValue
+
+// repeated .google.protobuf.Value values = 1;
+inline int ListValue::_internal_values_size() const {
+  return values_.size();
+}
+inline int ListValue::values_size() const {
+  return _internal_values_size();
+}
+inline void ListValue::clear_values() {
+  values_.Clear();
+}
+inline PROTOBUF_NAMESPACE_ID::Value* ListValue::mutable_values(int index) {
+  // @@protoc_insertion_point(field_mutable:google.protobuf.ListValue.values)
+  return values_.Mutable(index);
+}
+inline ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >*
+ListValue::mutable_values() {
+  // @@protoc_insertion_point(field_mutable_list:google.protobuf.ListValue.values)
+  return &values_;
+}
+inline const PROTOBUF_NAMESPACE_ID::Value& ListValue::_internal_values(int index) const {
+  return values_.Get(index);
+}
+inline const PROTOBUF_NAMESPACE_ID::Value& ListValue::values(int index) const {
+  // @@protoc_insertion_point(field_get:google.protobuf.ListValue.values)
+  return _internal_values(index);
+}
+inline PROTOBUF_NAMESPACE_ID::Value* ListValue::_internal_add_values() {
+  return values_.Add();
+}
+inline PROTOBUF_NAMESPACE_ID::Value* ListValue::add_values() {
+  // @@protoc_insertion_point(field_add:google.protobuf.ListValue.values)
+  return _internal_add_values();
+}
+inline const ::PROTOBUF_NAMESPACE_ID::RepeatedPtrField< PROTOBUF_NAMESPACE_ID::Value >&
+ListValue::values() const {
+  // @@protoc_insertion_point(field_list:google.protobuf.ListValue.values)
+  return values_;
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+// -------------------------------------------------------------------
+
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+PROTOBUF_NAMESPACE_OPEN
+
+template <> struct is_proto_enum< PROTOBUF_NAMESPACE_ID::NullValue> : ::std::true_type {};
+template <>
+inline const EnumDescriptor* GetEnumDescriptor< PROTOBUF_NAMESPACE_ID::NullValue>() {
+  return PROTOBUF_NAMESPACE_ID::NullValue_descriptor();
+}
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2fstruct_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/timestamp.pb.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/timestamp.pb.h
new file mode 100644
index 0000000000000000000000000000000000000000..95f333e728d8c81d5315bb1ad4143b024ecb9c4c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/timestamp.pb.h
@@ -0,0 +1,282 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Generated by the protocol buffer compiler.  DO NOT EDIT!
+// source: google/protobuf/timestamp.proto
+
+#ifndef GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ftimestamp_2eproto
+#define GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ftimestamp_2eproto
+
+#include <limits>
+#include <string>
+
+#include <google/protobuf/port_def.inc>
+#if PROTOBUF_VERSION < 3013000
+#error This file was generated by a newer version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please update
+#error your headers.
+#endif
+#if 3013000 < PROTOBUF_MIN_PROTOC_VERSION
+#error This file was generated by an older version of protoc which is
+#error incompatible with your Protocol Buffer headers. Please
+#error regenerate this file with a newer version of protoc.
+#endif
+
+#include <google/protobuf/port_undef.inc>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/arena.h>
+#include <google/protobuf/arenastring.h>
+#include <google/protobuf/generated_message_table_driven.h>
+#include <google/protobuf/generated_message_util.h>
+#include <google/protobuf/inlined_string_field.h>
+#include <google/protobuf/metadata_lite.h>
+#include <google/protobuf/generated_message_reflection.h>
+#include <google/protobuf/message.h>
+#include <google/protobuf/repeated_field.h>  // IWYU pragma: export
+#include <google/protobuf/extension_set.h>  // IWYU pragma: export
+#include <google/protobuf/unknown_field_set.h>
+// @@protoc_insertion_point(includes)
+#include <google/protobuf/port_def.inc>
+#define PROTOBUF_INTERNAL_EXPORT_google_2fprotobuf_2ftimestamp_2eproto PROTOBUF_EXPORT
+PROTOBUF_NAMESPACE_OPEN
+namespace internal {
+class AnyMetadata;
+}  // namespace internal
+PROTOBUF_NAMESPACE_CLOSE
+
+// Internal implementation detail -- do not use these members.
+struct PROTOBUF_EXPORT TableStruct_google_2fprotobuf_2ftimestamp_2eproto {
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTableField entries[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::AuxiliaryParseTableField aux[]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::ParseTable schema[1]
+    PROTOBUF_SECTION_VARIABLE(protodesc_cold);
+  static const ::PROTOBUF_NAMESPACE_ID::internal::FieldMetadata field_metadata[];
+  static const ::PROTOBUF_NAMESPACE_ID::internal::SerializationTable serialization_table[];
+  static const ::PROTOBUF_NAMESPACE_ID::uint32 offsets[];
+};
+extern PROTOBUF_EXPORT const ::PROTOBUF_NAMESPACE_ID::internal::DescriptorTable descriptor_table_google_2fprotobuf_2ftimestamp_2eproto;
+PROTOBUF_NAMESPACE_OPEN
+class Timestamp;
+class TimestampDefaultTypeInternal;
+PROTOBUF_EXPORT extern TimestampDefaultTypeInternal _Timestamp_default_instance_;
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+template<> PROTOBUF_EXPORT PROTOBUF_NAMESPACE_ID::Timestamp* Arena::CreateMaybeMessage<PROTOBUF_NAMESPACE_ID::Timestamp>(Arena*);
+PROTOBUF_NAMESPACE_CLOSE
+PROTOBUF_NAMESPACE_OPEN
+
+// ===================================================================
+
+class PROTOBUF_EXPORT Timestamp PROTOBUF_FINAL :
+    public ::PROTOBUF_NAMESPACE_ID::Message /* @@protoc_insertion_point(class_definition:google.protobuf.Timestamp) */ {
+ public:
+  inline Timestamp() : Timestamp(nullptr) {}
+  virtual ~Timestamp();
+
+  Timestamp(const Timestamp& from);
+  Timestamp(Timestamp&& from) noexcept
+    : Timestamp() {
+    *this = ::std::move(from);
+  }
+
+  inline Timestamp& operator=(const Timestamp& from) {
+    CopyFrom(from);
+    return *this;
+  }
+  inline Timestamp& operator=(Timestamp&& from) noexcept {
+    if (GetArena() == from.GetArena()) {
+      if (this != &from) InternalSwap(&from);
+    } else {
+      CopyFrom(from);
+    }
+    return *this;
+  }
+
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* descriptor() {
+    return GetDescriptor();
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Descriptor* GetDescriptor() {
+    return GetMetadataStatic().descriptor;
+  }
+  static const ::PROTOBUF_NAMESPACE_ID::Reflection* GetReflection() {
+    return GetMetadataStatic().reflection;
+  }
+  static const Timestamp& default_instance();
+
+  static void InitAsDefaultInstance();  // FOR INTERNAL USE ONLY
+  static inline const Timestamp* internal_default_instance() {
+    return reinterpret_cast<const Timestamp*>(
+               &_Timestamp_default_instance_);
+  }
+  static constexpr int kIndexInFileMessages =
+    0;
+
+  friend void swap(Timestamp& a, Timestamp& b) {
+    a.Swap(&b);
+  }
+  inline void Swap(Timestamp* other) {
+    if (other == this) return;
+    if (GetArena() == other->GetArena()) {
+      InternalSwap(other);
+    } else {
+      ::PROTOBUF_NAMESPACE_ID::internal::GenericSwap(this, other);
+    }
+  }
+  void UnsafeArenaSwap(Timestamp* other) {
+    if (other == this) return;
+    GOOGLE_DCHECK(GetArena() == other->GetArena());
+    InternalSwap(other);
+  }
+
+  // implements Message ----------------------------------------------
+
+  inline Timestamp* New() const final {
+    return CreateMaybeMessage<Timestamp>(nullptr);
+  }
+
+  Timestamp* New(::PROTOBUF_NAMESPACE_ID::Arena* arena) const final {
+    return CreateMaybeMessage<Timestamp>(arena);
+  }
+  void CopyFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void MergeFrom(const ::PROTOBUF_NAMESPACE_ID::Message& from) final;
+  void CopyFrom(const Timestamp& from);
+  void MergeFrom(const Timestamp& from);
+  PROTOBUF_ATTRIBUTE_REINITIALIZES void Clear() final;
+  bool IsInitialized() const final;
+
+  size_t ByteSizeLong() const final;
+  const char* _InternalParse(const char* ptr, ::PROTOBUF_NAMESPACE_ID::internal::ParseContext* ctx) final;
+  ::PROTOBUF_NAMESPACE_ID::uint8* _InternalSerialize(
+      ::PROTOBUF_NAMESPACE_ID::uint8* target, ::PROTOBUF_NAMESPACE_ID::io::EpsCopyOutputStream* stream) const final;
+  int GetCachedSize() const final { return _cached_size_.Get(); }
+
+  private:
+  inline void SharedCtor();
+  inline void SharedDtor();
+  void SetCachedSize(int size) const final;
+  void InternalSwap(Timestamp* other);
+  friend class ::PROTOBUF_NAMESPACE_ID::internal::AnyMetadata;
+  static ::PROTOBUF_NAMESPACE_ID::StringPiece FullMessageName() {
+    return "google.protobuf.Timestamp";
+  }
+  protected:
+  explicit Timestamp(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  private:
+  static void ArenaDtor(void* object);
+  inline void RegisterArenaDtor(::PROTOBUF_NAMESPACE_ID::Arena* arena);
+  public:
+
+  ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadata() const final;
+  private:
+  static ::PROTOBUF_NAMESPACE_ID::Metadata GetMetadataStatic() {
+    ::PROTOBUF_NAMESPACE_ID::internal::AssignDescriptors(&::descriptor_table_google_2fprotobuf_2ftimestamp_2eproto);
+    return ::descriptor_table_google_2fprotobuf_2ftimestamp_2eproto.file_level_metadata[kIndexInFileMessages];
+  }
+
+  public:
+
+  // nested types ----------------------------------------------------
+
+  // accessors -------------------------------------------------------
+
+  enum : int {
+    kSecondsFieldNumber = 1,
+    kNanosFieldNumber = 2,
+  };
+  // int64 seconds = 1;
+  void clear_seconds();
+  ::PROTOBUF_NAMESPACE_ID::int64 seconds() const;
+  void set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int64 _internal_seconds() const;
+  void _internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value);
+  public:
+
+  // int32 nanos = 2;
+  void clear_nanos();
+  ::PROTOBUF_NAMESPACE_ID::int32 nanos() const;
+  void set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value);
+  private:
+  ::PROTOBUF_NAMESPACE_ID::int32 _internal_nanos() const;
+  void _internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value);
+  public:
+
+  // @@protoc_insertion_point(class_scope:google.protobuf.Timestamp)
+ private:
+  class _Internal;
+
+  template <typename T> friend class ::PROTOBUF_NAMESPACE_ID::Arena::InternalHelper;
+  typedef void InternalArenaConstructable_;
+  typedef void DestructorSkippable_;
+  ::PROTOBUF_NAMESPACE_ID::int64 seconds_;
+  ::PROTOBUF_NAMESPACE_ID::int32 nanos_;
+  mutable ::PROTOBUF_NAMESPACE_ID::internal::CachedSize _cached_size_;
+  friend struct ::TableStruct_google_2fprotobuf_2ftimestamp_2eproto;
+};
+// ===================================================================
+
+
+// ===================================================================
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic push
+  #pragma GCC diagnostic ignored "-Wstrict-aliasing"
+#endif  // __GNUC__
+// Timestamp
+
+// int64 seconds = 1;
+inline void Timestamp::clear_seconds() {
+  seconds_ = PROTOBUF_LONGLONG(0);
+}
+inline ::PROTOBUF_NAMESPACE_ID::int64 Timestamp::_internal_seconds() const {
+  return seconds_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int64 Timestamp::seconds() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Timestamp.seconds)
+  return _internal_seconds();
+}
+inline void Timestamp::_internal_set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) {
+  
+  seconds_ = value;
+}
+inline void Timestamp::set_seconds(::PROTOBUF_NAMESPACE_ID::int64 value) {
+  _internal_set_seconds(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Timestamp.seconds)
+}
+
+// int32 nanos = 2;
+inline void Timestamp::clear_nanos() {
+  nanos_ = 0;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 Timestamp::_internal_nanos() const {
+  return nanos_;
+}
+inline ::PROTOBUF_NAMESPACE_ID::int32 Timestamp::nanos() const {
+  // @@protoc_insertion_point(field_get:google.protobuf.Timestamp.nanos)
+  return _internal_nanos();
+}
+inline void Timestamp::_internal_set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  
+  nanos_ = value;
+}
+inline void Timestamp::set_nanos(::PROTOBUF_NAMESPACE_ID::int32 value) {
+  _internal_set_nanos(value);
+  // @@protoc_insertion_point(field_set:google.protobuf.Timestamp.nanos)
+}
+
+#ifdef __GNUC__
+  #pragma GCC diagnostic pop
+#endif  // __GNUC__
+
+// @@protoc_insertion_point(namespace_scope)
+
+PROTOBUF_NAMESPACE_CLOSE
+
+// @@protoc_insertion_point(global_scope)
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_INCLUDED_GOOGLE_PROTOBUF_INCLUDED_google_2fprotobuf_2ftimestamp_2eproto
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/unknown_field_set.h b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/unknown_field_set.h
new file mode 100644
index 0000000000000000000000000000000000000000..fe676b9f8dd787f2aa933bd991b29cce97e58577
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/torch/include/google/protobuf/unknown_field_set.h
@@ -0,0 +1,416 @@
+#if !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
+// Protocol Buffers - Google's data interchange format
+// Copyright 2008 Google Inc.  All rights reserved.
+// https://developers.google.com/protocol-buffers/
+//
+// Redistribution and use in source and binary forms, with or without
+// modification, are permitted provided that the following conditions are
+// met:
+//
+//     * Redistributions of source code must retain the above copyright
+// notice, this list of conditions and the following disclaimer.
+//     * Redistributions in binary form must reproduce the above
+// copyright notice, this list of conditions and the following disclaimer
+// in the documentation and/or other materials provided with the
+// distribution.
+//     * Neither the name of Google Inc. nor the names of its
+// contributors may be used to endorse or promote products derived from
+// this software without specific prior written permission.
+//
+// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+// Author: kenton@google.com (Kenton Varda)
+//  Based on original Protocol Buffers design by
+//  Sanjay Ghemawat, Jeff Dean, and others.
+//
+// Contains classes used to keep track of unrecognized fields seen while
+// parsing a protocol message.
+
+#ifndef GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__
+#define GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__
+
+#include <assert.h>
+
+#include <string>
+#include <vector>
+
+#include <google/protobuf/stubs/common.h>
+#include <google/protobuf/stubs/logging.h>
+#include <google/protobuf/parse_context.h>
+#include <google/protobuf/io/coded_stream.h>
+#include <google/protobuf/io/zero_copy_stream_impl_lite.h>
+#include <google/protobuf/message_lite.h>
+#include <google/protobuf/port.h>
+
+#include <google/protobuf/port_def.inc>
+
+#ifdef SWIG
+#error "You cannot SWIG proto headers"
+#endif
+
+namespace google {
+namespace protobuf {
+namespace internal {
+class InternalMetadata;           // metadata_lite.h
+class WireFormat;                 // wire_format.h
+class MessageSetFieldSkipperUsingCord;
+// extension_set_heavy.cc
+}  // namespace internal
+
+class Message;       // message.h
+class UnknownField;  // below
+
+// An UnknownFieldSet contains fields that were encountered while parsing a
+// message but were not defined by its type.  Keeping track of these can be
+// useful, especially in that they may be written if the message is serialized
+// again without being cleared in between.  This means that software which
+// simply receives messages and forwards them to other servers does not need
+// to be updated every time a new field is added to the message definition.
+//
+// To get the UnknownFieldSet attached to any message, call
+// Reflection::GetUnknownFields().
+//
+// This class is necessarily tied to the protocol buffer wire format, unlike
+// the Reflection interface which is independent of any serialization scheme.
+class PROTOBUF_EXPORT UnknownFieldSet {
+ public:
+  UnknownFieldSet();
+  ~UnknownFieldSet();
+
+  // Remove all fields.
+  inline void Clear();
+
+  // Remove all fields and deallocate internal data objects
+  void ClearAndFreeMemory();
+
+  // Is this set empty?
+  inline bool empty() const;
+
+  // Merge the contents of some other UnknownFieldSet with this one.
+  void MergeFrom(const UnknownFieldSet& other);
+
+  // Similar to above, but this function will destroy the contents of other.
+  void MergeFromAndDestroy(UnknownFieldSet* other);
+
+  // Merge the contents an UnknownFieldSet with the UnknownFieldSet in
+  // *metadata, if there is one.  If *metadata doesn't have an UnknownFieldSet
+  // then add one to it and make it be a copy of the first arg.
+  static void MergeToInternalMetadata(const UnknownFieldSet& other,
+                                      internal::InternalMetadata* metadata);
+
+  // Swaps the contents of some other UnknownFieldSet with this one.
+  inline void Swap(UnknownFieldSet* x);
+
+  // Computes (an estimate of) the total number of bytes currently used for
+  // storing the unknown fields in memory. Does NOT include
+  // sizeof(*this) in the calculation.
+  size_t SpaceUsedExcludingSelfLong() const;
+
+  int SpaceUsedExcludingSelf() const {
+    return internal::ToIntSize(SpaceUsedExcludingSelfLong());
+  }
+
+  // Version of SpaceUsed() including sizeof(*this).
+  size_t SpaceUsedLong() const;
+
+  int SpaceUsed() const { return internal::ToIntSize(SpaceUsedLong()); }
+
+  // Returns the number of fields present in the UnknownFieldSet.
+  inline int field_count() const;
+  // Get a field in the set, where 0 <= index < field_count().  The fields
+  // appear in the order in which they were added.
+  inline const UnknownField& field(int index) const;
+  // Get a mutable pointer to a field in the set, where
+  // 0 <= index < field_count().  The fields appear in the order in which
+  // they were added.
+  inline UnknownField* mutable_field(int index);
+
+  // Adding fields ---------------------------------------------------
+
+  void AddVarint(int number, uint64 value);
+  void AddFixed32(int number, uint32 value);
+  void AddFixed64(int number, uint64 value);
+  void AddLengthDelimited(int number, const std::string& value);
+  std::string* AddLengthDelimited(int number);
+  UnknownFieldSet* AddGroup(int number);
+
+  // Adds an unknown field from another set.
+  void AddField(const UnknownField& field);
+
+  // Delete fields with indices in the range [start .. start+num-1].
+  // Caution: implementation moves all fields with indices [start+num .. ].
+  void DeleteSubrange(int start, int num);
+
+  // Delete all fields with a specific field number. The order of left fields
+  // is preserved.
+  // Caution: implementation moves all fields after the first deleted field.
+  void DeleteByNumber(int number);
+
+  // Parsing helpers -------------------------------------------------
+  // These work exactly like the similarly-named methods of Message.
+
+  bool MergeFromCodedStream(io::CodedInputStream* input);
+  bool ParseFromCodedStream(io::CodedInputStream* input);
+  bool ParseFromZeroCopyStream(io::ZeroCopyInputStream* input);
+  bool ParseFromArray(const void* data, int size);
+  inline bool ParseFromString(const std::string& data) {
+    return ParseFromArray(data.data(), static_cast<int>(data.size()));
+  }
+
+  // Merges this message's unknown field data (if any).  This works whether
+  // the message is a lite or full proto (for legacy reasons, lite and full
+  // return different types for MessageType::unknown_fields()).
+  template <typename MessageType>
+  bool MergeFromMessage(const MessageType& message);
+
+  static const UnknownFieldSet& default_instance();
+
+ private:
+  // For InternalMergeFrom
+  friend class UnknownField;
+  // Merges from other UnknownFieldSet. This method assumes, that this object
+  // is newly created and has no fields.
+  void InternalMergeFrom(const UnknownFieldSet& other);
+  void ClearFallback();
+
+  template <typename MessageType,
+            typename std::enable_if<
+                std::is_base_of<Message, MessageType>::value, int>::type = 0>
+  bool InternalMergeFromMessage(const MessageType& message) {
+    MergeFrom(message.GetReflection()->GetUnknownFields(message));
+    return true;
+  }
+
+  template <typename MessageType,
+            typename std::enable_if<
+                std::is_base_of<MessageLite, MessageType>::value &&
+                    !std::is_base_of<Message, MessageType>::value,
+                int>::type = 0>
+  bool InternalMergeFromMessage(const MessageType& message) {
+    const auto& unknown_fields = message.unknown_fields();
+    io::ArrayInputStream array_stream(unknown_fields.data(),
+                                      unknown_fields.size());
+    io::CodedInputStream coded_stream(&array_stream);
+    return MergeFromCodedStream(&coded_stream);
+  }
+
+  std::vector<UnknownField> fields_;
+  GOOGLE_DISALLOW_EVIL_CONSTRUCTORS(UnknownFieldSet);
+};
+
+namespace internal {
+
+inline void WriteVarint(uint32 num, uint64 val, UnknownFieldSet* unknown) {
+  unknown->AddVarint(num, val);
+}
+inline void WriteLengthDelimited(uint32 num, StringPiece val,
+                                 UnknownFieldSet* unknown) {
+  unknown->AddLengthDelimited(num)->assign(val.data(), val.size());
+}
+
+PROTOBUF_EXPORT
+const char* UnknownGroupParse(UnknownFieldSet* unknown, const char* ptr,
+                              ParseContext* ctx);
+PROTOBUF_EXPORT
+const char* UnknownFieldParse(uint64 tag, UnknownFieldSet* unknown,
+                              const char* ptr, ParseContext* ctx);
+
+}  // namespace internal
+
+// Represents one field in an UnknownFieldSet.
+class PROTOBUF_EXPORT UnknownField {
+ public:
+  enum Type {
+    TYPE_VARINT,
+    TYPE_FIXED32,
+    TYPE_FIXED64,
+    TYPE_LENGTH_DELIMITED,
+    TYPE_GROUP
+  };
+
+  // The field's field number, as seen on the wire.
+  inline int number() const;
+
+  // The field type.
+  inline Type type() const;
+
+  // Accessors -------------------------------------------------------
+  // Each method works only for UnknownFields of the corresponding type.
+
+  inline uint64 varint() const;
+  inline uint32 fixed32() const;
+  inline uint64 fixed64() const;
+  inline const std::string& length_delimited() const;
+  inline const UnknownFieldSet& group() const;
+
+  inline void set_varint(uint64 value);
+  inline void set_fixed32(uint32 value);
+  inline void set_fixed64(uint64 value);
+  inline void set_length_delimited(const std::string& value);
+  inline std::string* mutable_length_delimited();
+  inline UnknownFieldSet* mutable_group();
+
+  // Serialization API.
+  // These methods can take advantage of the underlying implementation and may
+  // archieve a better performance than using getters to retrieve the data and
+  // do the serialization yourself.
+  void SerializeLengthDelimitedNoTag(io::CodedOutputStream* output) const {
+    output->SetCur(InternalSerializeLengthDelimitedNoTag(output->Cur(),
+                                                         output->EpsCopy()));
+  }
+
+  inline size_t GetLengthDelimitedSize() const;
+  uint8* InternalSerializeLengthDelimitedNoTag(
+      uint8* target, io::EpsCopyOutputStream* stream) const;
+
+
+  // If this UnknownField contains a pointer, delete it.
+  void Delete();
+
+  // Make a deep copy of any pointers in this UnknownField.
+  void DeepCopy(const UnknownField& other);
+
+  // Set the wire type of this UnknownField. Should only be used when this
+  // UnknownField is being created.
+  inline void SetType(Type type);
+
+  union LengthDelimited {
+    std::string* string_value;
+  };
+
+  uint32 number_;
+  uint32 type_;
+  union {
+    uint64 varint_;
+    uint32 fixed32_;
+    uint64 fixed64_;
+    mutable union LengthDelimited length_delimited_;
+    UnknownFieldSet* group_;
+  } data_;
+};
+
+// ===================================================================
+// inline implementations
+
+inline UnknownFieldSet::UnknownFieldSet() {}
+
+inline UnknownFieldSet::~UnknownFieldSet() { Clear(); }
+
+inline void UnknownFieldSet::ClearAndFreeMemory() { Clear(); }
+
+inline void UnknownFieldSet::Clear() {
+  if (!fields_.empty()) {
+    ClearFallback();
+  }
+}
+
+inline bool UnknownFieldSet::empty() const { return fields_.empty(); }
+
+inline void UnknownFieldSet::Swap(UnknownFieldSet* x) {
+  fields_.swap(x->fields_);
+}
+
+inline int UnknownFieldSet::field_count() const {
+  return static_cast<int>(fields_.size());
+}
+inline const UnknownField& UnknownFieldSet::field(int index) const {
+  return (fields_)[static_cast<size_t>(index)];
+}
+inline UnknownField* UnknownFieldSet::mutable_field(int index) {
+  return &(fields_)[static_cast<size_t>(index)];
+}
+
+inline void UnknownFieldSet::AddLengthDelimited(int number,
+                                                const std::string& value) {
+  AddLengthDelimited(number)->assign(value);
+}
+
+
+
+
+inline int UnknownField::number() const { return static_cast<int>(number_); }
+inline UnknownField::Type UnknownField::type() const {
+  return static_cast<Type>(type_);
+}
+
+inline uint64 UnknownField::varint() const {
+  assert(type() == TYPE_VARINT);
+  return data_.varint_;
+}
+inline uint32 UnknownField::fixed32() const {
+  assert(type() == TYPE_FIXED32);
+  return data_.fixed32_;
+}
+inline uint64 UnknownField::fixed64() const {
+  assert(type() == TYPE_FIXED64);
+  return data_.fixed64_;
+}
+inline const std::string& UnknownField::length_delimited() const {
+  assert(type() == TYPE_LENGTH_DELIMITED);
+  return *data_.length_delimited_.string_value;
+}
+inline const UnknownFieldSet& UnknownField::group() const {
+  assert(type() == TYPE_GROUP);
+  return *data_.group_;
+}
+
+inline void UnknownField::set_varint(uint64 value) {
+  assert(type() == TYPE_VARINT);
+  data_.varint_ = value;
+}
+inline void UnknownField::set_fixed32(uint32 value) {
+  assert(type() == TYPE_FIXED32);
+  data_.fixed32_ = value;
+}
+inline void UnknownField::set_fixed64(uint64 value) {
+  assert(type() == TYPE_FIXED64);
+  data_.fixed64_ = value;
+}
+inline void UnknownField::set_length_delimited(const std::string& value) {
+  assert(type() == TYPE_LENGTH_DELIMITED);
+  data_.length_delimited_.string_value->assign(value);
+}
+inline std::string* UnknownField::mutable_length_delimited() {
+  assert(type() == TYPE_LENGTH_DELIMITED);
+  return data_.length_delimited_.string_value;
+}
+inline UnknownFieldSet* UnknownField::mutable_group() {
+  assert(type() == TYPE_GROUP);
+  return data_.group_;
+}
+template <typename MessageType>
+bool UnknownFieldSet::MergeFromMessage(const MessageType& message) {
+  // SFINAE will route to the right version.
+  return InternalMergeFromMessage(message);
+}
+
+
+inline size_t UnknownField::GetLengthDelimitedSize() const {
+  GOOGLE_DCHECK_EQ(TYPE_LENGTH_DELIMITED, type());
+  return data_.length_delimited_.string_value->size();
+}
+
+inline void UnknownField::SetType(Type type) {
+  type_ = type;
+}
+
+
+}  // namespace protobuf
+}  // namespace google
+
+#include <google/protobuf/port_undef.inc>
+#endif  // GOOGLE_PROTOBUF_UNKNOWN_FIELD_SET_H__
+
+#else
+#error "This file should not be included when either TORCH_STABLE_ONLY or TORCH_TARGET_VERSION is defined."
+#endif  // !defined(TORCH_STABLE_ONLY) && !defined(TORCH_TARGET_VERSION)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5104066e5acbb8a8792cfa92f219f03f8648724
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cfdf49b2893e65654786213423f9e81001342e97
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/configuration_albert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd34030a87233f03fc18610dcc5c7afafff72f70
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/modeling_flax_albert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..946ff44e42202ded5f49ba32270f31ec59d82de9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/albert/__pycache__/tokenization_albert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb22bd35378a4effd105932791e050ff09c846ca
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/configuration_align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/configuration_align.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c259fb33d77887788cd82cb9f2c4bac766fc1ab7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/configuration_align.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/modeling_align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/modeling_align.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f5183d063f6581189577fea33420073c554b51da
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/modeling_align.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/processing_align.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/processing_align.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3beb50940016d119f1606898e3c0efce8681b68b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/align/__pycache__/processing_align.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..124478739f61fe4b2e3620adc75f13c5afe36587
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/processing_altclip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/processing_altclip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88e15cf50dc6dc8ae4b1b19ed6623899be34dd73
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/altclip/__pycache__/processing_altclip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fc90b3c9f905ef221430dc646bbbbff46b474262
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee06407c1dd6128cbf2528c51e04e534b9d617eb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/configuration_apertus.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad0085cef93a3b64a47ee114d76317edf6d4415c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modeling_apertus.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..deacc9538cf55b1a94fa67741b79f5904bdc6833
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/apertus/__pycache__/modular_apertus.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79cc66dbd420a23db279478f0aeae4f230bf82cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9332d2a44a6b78fcf989a02a5f4faf9f09e2b9b4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/configuration_arcee.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a142383651b5f5b95746bea9f82fa7c42cf7652
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modeling_arcee.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ba720015a1c761daa1c18a9bdcb76784ce7e720e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/arcee/__pycache__/modular_arcee.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..709d0be6c74a10a1eaef56b1168dabb3008a8ff3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..100cc5c6c41009fcb0e4ba9a4f526761d912c25d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/configuration_aria.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcb6197dad26e6f4dee10188ff7e0be3df19448b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/image_processing_aria.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1713498288eb168034b6274fc54e0dd5a6e86524
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modeling_aria.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c75c6152f0e3c3b71cd9f20183469c23324fc9a4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/modular_aria.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..318812012bc51709e983a3e423fc939cdc9a2ba9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/aria/__pycache__/processing_aria.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e15e6ac1e94e7b894e096e197c75bf05f00907ea
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/configuration_autoformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/configuration_autoformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88cff3d1864f3729489bb16a58ed98612d55f4c8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/autoformer/__pycache__/configuration_autoformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c047eb12c6d24e3d1af407ad450eaa425c40036
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/configuration_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/configuration_bark.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c1d9d1d6d6958fb3cbddcb83d503248b5911be77
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/configuration_bark.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/generation_configuration_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/generation_configuration_bark.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa727af000416cf656d074157929400db7f0c7b4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/generation_configuration_bark.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/modeling_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/modeling_bark.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79f0dc17e1c0d59afa3867b2df0cece63b20b62b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/modeling_bark.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/processing_bark.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/processing_bark.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..56f3dd054e729a8675e9ae8171467cc33639b7cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bark/__pycache__/processing_bark.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b6759566c9a61c2064ff95bb340c7011fe5e6ae
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/configuration_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/configuration_beit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ea247c01c3ed71972b5a0137cd1cce9460dbd02
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/configuration_beit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/feature_extraction_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/feature_extraction_beit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a6325e616e28ea07dc98176d42a2c8521f6d0b8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/feature_extraction_beit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a246ea0e0e768ab28fd145bc789525d7d91f63d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01b4c1cf2d9098c1304c5616d6eff03eed6d6426
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/image_processing_beit_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_beit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6350c942559de86d482b1656099812582274a6b6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_beit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_flax_beit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_flax_beit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64b344a532adbd86af8624d3e0cb414f27adf1f2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/beit/__pycache__/modeling_flax_beit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0921587deb0f9f3790adeb655aea06da24443bd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/configuration_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/configuration_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e7e8e6555522a39d6d9e52fbd03c9535c0ed504
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/configuration_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_flax_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_flax_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ba6df18a425eaba9eec9377855ce9ab0d71d37e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_flax_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_tf_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_tf_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03fcc508e4ca96a9f4fcd576023b298d24087be4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/modeling_tf_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6957ad4b6fea47fb10f2947c7990cd5b462527f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_tf.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..696c020e07597039947856eeffa81dccc0568a37
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert/__pycache__/tokenization_bert_tf.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8d7e6b9e48f4cf94e72aecce2d5729a965aee17
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/configuration_bert_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/configuration_bert_generation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bcff567016ed93845a983d99c48bb638d6c2bae6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/configuration_bert_generation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/modeling_bert_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/modeling_bert_generation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8989002978aa3a3fdc09ad32bbcd165b26361708
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/modeling_bert_generation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/tokenization_bert_generation.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/tokenization_bert_generation.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5e323823e6653e8404d9b47899715bb237bdc5a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bert_generation/__pycache__/tokenization_bert_generation.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..87419e69e5c7f0572076832237e10f236186ec33
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_big_bird import *
+    from .modeling_big_bird import *
+    from .modeling_flax_big_bird import *
+    from .tokenization_big_bird import *
+    from .tokenization_big_bird_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py
new file mode 100644
index 0000000000000000000000000000000000000000..8e29439bc4bd54f06689b7a3e93008baf93427ad
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/configuration_big_bird.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""BigBird model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BigBirdConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BigBirdModel`]. It is used to instantiate an
+    BigBird model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the BigBird
+    [google/bigbird-roberta-base](https://huggingface.co/google/bigbird-roberta-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50358):
+            Vocabulary size of the BigBird model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`BigBirdModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 1024 or 2048 or 4096).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`BigBirdModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        attention_type (`str`, *optional*, defaults to `"block_sparse"`)
+            Whether to use block sparse attention (with n complexity) as introduced in paper or original attention
+            layer (with n^2 complexity). Possible values are `"original_full"` and `"block_sparse"`.
+        use_bias (`bool`, *optional*, defaults to `True`)
+            Whether to use bias in query, key, value.
+        rescale_embeddings (`bool`, *optional*, defaults to `False`)
+            Whether to rescale embeddings with (hidden_size ** 0.5).
+        block_size (`int`, *optional*, defaults to 64)
+            Size of each block. Useful only when `attention_type == "block_sparse"`.
+        num_random_blocks (`int`, *optional*, defaults to 3)
+            Each query is going to attend these many number of random blocks. Useful only when `attention_type ==
+            "block_sparse"`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Example:
+
+    ```python
+    >>> from transformers import BigBirdConfig, BigBirdModel
+
+    >>> # Initializing a BigBird google/bigbird-roberta-base style configuration
+    >>> configuration = BigBirdConfig()
+
+    >>> # Initializing a model (with random weights) from the google/bigbird-roberta-base style configuration
+    >>> model = BigBirdModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "big_bird"
+
+    def __init__(
+        self,
+        vocab_size=50358,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu_new",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=4096,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        sep_token_id=66,
+        attention_type="block_sparse",
+        use_bias=True,
+        rescale_embeddings=False,
+        block_size=64,
+        num_random_blocks=3,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            sep_token_id=sep_token_id,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.type_vocab_size = type_vocab_size
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+
+        self.rescale_embeddings = rescale_embeddings
+        self.attention_type = attention_type
+        self.use_bias = use_bias
+        self.block_size = block_size
+        self.num_random_blocks = num_random_blocks
+        self.classifier_dropout = classifier_dropout
+
+
+class BigBirdOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["BigBirdConfig", "BigBirdOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py
new file mode 100644
index 0000000000000000000000000000000000000000..eb89d9872be8816c83158faa4ac81c2238c634d7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_big_bird.py
@@ -0,0 +1,2957 @@
+# coding=utf-8
+# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BigBird model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_big_bird import BigBirdConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+_TRIVIA_QA_MAPPING = {
+    "big_bird_attention": "attention/self",
+    "output_layer_norm": "output/LayerNorm",
+    "attention_output": "attention/output/dense",
+    "output": "output/dense",
+    "self_attention_layer_norm": "attention/output/LayerNorm",
+    "intermediate": "intermediate/dense",
+    "word_embeddings": "bert/embeddings/word_embeddings",
+    "position_embedding": "bert/embeddings/position_embeddings",
+    "type_embeddings": "bert/embeddings/token_type_embeddings",
+    "embeddings": "bert/embeddings",
+    "layer_normalization": "output/LayerNorm",
+    "layer_norm": "LayerNorm",
+    "trivia_qa_head": "qa_classifier",
+    "dense": "intermediate/dense",
+    "dense_1": "qa_outputs",
+}
+
+
+def load_tf_weights_in_big_bird(model, tf_checkpoint_path, is_trivia_qa=False):
+    """Load tf checkpoints in a pytorch model."""
+
+    def load_tf_weights_bert(init_vars, tf_path):
+        names = []
+        tf_weights = {}
+
+        for name, shape in init_vars:
+            array = tf.train.load_variable(tf_path, name)
+            name = name.replace("bert/encoder/LayerNorm", "bert/embeddings/LayerNorm")
+            logger.info(f"Loading TF weight {name} with shape {shape}")
+            names.append(name)
+            tf_weights[name] = array
+
+        return names, tf_weights
+
+    def load_tf_weights_trivia_qa(init_vars):
+        names = []
+        tf_weights = {}
+
+        for i, var in enumerate(init_vars):
+            name_items = var.name.split("/")
+
+            if "transformer_scaffold" in name_items[0]:
+                layer_name_items = name_items[0].split("_")
+                if len(layer_name_items) < 3:
+                    layer_name_items += [0]
+
+                name_items[0] = f"bert/encoder/layer_{layer_name_items[2]}"
+
+            name = "/".join([_TRIVIA_QA_MAPPING.get(x, x) for x in name_items])[:-2]  # remove last :0 in variable
+
+            if "self/attention/output" in name:
+                name = name.replace("self/attention/output", "output")
+
+            if i >= len(init_vars) - 2:
+                name = name.replace("intermediate", "output")
+
+            logger.info(f"Loading TF weight {name} with shape {var.shape}")
+            array = var.value().numpy()
+            names.append(name)
+            tf_weights[name] = array
+
+        return names, tf_weights
+
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+
+    # Load weights from TF model
+    init_vars = tf.saved_model.load(tf_path).variables if is_trivia_qa else tf.train.list_variables(tf_path)
+
+    if len(init_vars) <= 0:
+        raise ValueError("Loaded trained variables cannot be empty.")
+
+    pt_names = list(model.state_dict().keys())
+
+    if is_trivia_qa:
+        names, tf_weights = load_tf_weights_trivia_qa(init_vars)
+    else:
+        names, tf_weights = load_tf_weights_bert(init_vars, tf_path)
+
+    for txt_name in names:
+        array = tf_weights[txt_name]
+        name = txt_name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        pt_name = []
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                scope_names = re.split(r"_(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+                pt_name.append("weight")
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                pointer = getattr(pointer, "bias")
+                pt_name.append("bias")
+            elif scope_names[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+                pt_name.append("weight")
+            elif scope_names[0] == "squad":
+                pointer = getattr(pointer, "classifier")
+                pt_name.append("classifier")
+            elif scope_names[0] == "transform":
+                pointer = getattr(pointer, "transform")
+                pt_name.append("transform")
+                if ("bias" in name) or ("kernel" in name):
+                    pointer = getattr(pointer, "dense")
+                    pt_name.append("dense")
+                elif ("beta" in name) or ("gamma" in name):
+                    pointer = getattr(pointer, "LayerNorm")
+                    pt_name.append("LayerNorm")
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                    pt_name.append(f"{scope_names[0]}")
+                except AttributeError:
+                    logger.info(f"Skipping {m_name}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+                pt_name.append(f"{num}")
+        if m_name[-11:] == "_embeddings" or m_name == "embeddings":
+            pointer = getattr(pointer, "weight")
+            pt_name.append("weight")
+        elif m_name == "kernel":
+            array = np.transpose(array)
+        try:
+            if len(array.shape) > len(pointer.shape) and math.prod(array.shape) == math.prod(pointer.shape):
+                # print(txt_name, array.shape)
+                if (
+                    txt_name.endswith("attention/self/key/kernel")
+                    or txt_name.endswith("attention/self/query/kernel")
+                    or txt_name.endswith("attention/self/value/kernel")
+                ):
+                    array = array.transpose(1, 0, 2).reshape(pointer.shape)
+                elif txt_name.endswith("attention/output/dense/kernel"):
+                    array = array.transpose(0, 2, 1).reshape(pointer.shape)
+                else:
+                    array = array.reshape(pointer.shape)
+
+            if pointer.shape != array.shape:
+                raise ValueError(
+                    f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched of {txt_name}."
+                )
+        except ValueError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        pt_weight_name = ".".join(pt_name)
+        logger.info(f"Initialize PyTorch weight {pt_weight_name} from {txt_name}.")
+        pointer.data = torch.from_numpy(array)
+        tf_weights.pop(txt_name, None)
+        pt_names.remove(pt_weight_name)
+
+    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}.")
+    logger.info(f"Weights not initialized in PyTorch model: {', '.join(pt_names)}.")
+    return model
+
+
+class BigBirdEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    # Copied from transformers.models.bert.modeling_bert.BertEmbeddings.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+        # End copy
+
+        self.rescale_embeddings = config.rescale_embeddings
+        self.hidden_size = config.hidden_size
+
+    def forward(
+        self, input_ids=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        if self.rescale_embeddings:
+            inputs_embeds = inputs_embeds * (self.hidden_size**0.5)
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings += position_embeddings
+
+        embeddings = self.dropout(embeddings)
+        embeddings = self.LayerNorm(embeddings)
+        return embeddings
+
+
+class BigBirdSelfAttention(nn.Module):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.is_decoder = config.is_decoder
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        is_cross_attention = encoder_hidden_states is not None
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        attention_mask = encoder_attention_mask if is_cross_attention else attention_mask
+        if is_cross_attention and past_key_values is not None and past_key_values.get_seq_length(self.layer_idx) > 0:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_values.layers[self.layer_idx].keys
+            value_layer = past_key_values.layers[self.layer_idx].values
+        else:
+            key_layer = (
+                self.key(current_states)
+                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+            value_layer = (
+                self.value(current_states)
+                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                key_layer, value_layer = past_key_values.update(
+                    key_layer,
+                    value_layer,
+                    self.layer_idx,
+                )
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BigBirdModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+class BigBirdBlockSparseAttention(nn.Module):
+    def __init__(self, config, seed=None):
+        super().__init__()
+
+        self.max_seqlen = config.max_position_embeddings
+        self.seed = seed
+
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.num_random_blocks = config.num_random_blocks
+        self.block_size = config.block_size
+
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.use_bias)
+
+    def forward(
+        self,
+        hidden_states,
+        band_mask=None,
+        from_mask=None,
+        to_mask=None,
+        from_blocked_mask=None,
+        to_blocked_mask=None,
+        output_attentions=None,
+    ):
+        # Currently this `class` can't be used in decoder.
+
+        batch_size, seqlen, _ = hidden_states.size()
+        to_seq_length = from_seq_length = seqlen
+        from_block_size = to_block_size = self.block_size
+
+        if from_seq_length % from_block_size != 0:
+            raise ValueError("Query sided sequence length must be multiple of block size")
+
+        if to_seq_length % to_block_size != 0:
+            raise ValueError("Key/Value sided sequence length must be multiple of block size")
+
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        key_layer = (
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        context_layer, attention_probs = self.bigbird_block_sparse_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            band_mask,
+            from_mask,
+            to_mask,
+            from_blocked_mask,
+            to_blocked_mask,
+            self.num_attention_heads,
+            self.num_random_blocks,
+            self.attention_head_size,
+            from_block_size,
+            to_block_size,
+            batch_size,
+            from_seq_length,
+            to_seq_length,
+            seed=self.seed,
+            plan_from_length=None,
+            plan_num_rand_blocks=None,
+            output_attentions=output_attentions,
+        )
+
+        context_layer = context_layer.contiguous().view(batch_size, from_seq_length, -1)
+        return context_layer, attention_probs
+
+    @staticmethod
+    def torch_bmm_nd(inp_1, inp_2, ndim=None):
+        """Fast nd matrix multiplication"""
+        # faster replacement of torch.einsum ("bhqk,bhkd->bhqd")
+        return torch.bmm(inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:])).view(
+            inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 1])
+        )
+
+    @staticmethod
+    def torch_bmm_nd_transpose(inp_1, inp_2, ndim=None):
+        """Fast nd matrix multiplication with transpose"""
+        # faster replacement of torch.einsum (bhqd,bhkd->bhqk)
+        return torch.bmm(
+            inp_1.reshape((-1,) + inp_1.shape[-2:]), inp_2.reshape((-1,) + inp_2.shape[-2:]).transpose(1, 2)
+        ).view(inp_1.shape[: ndim - 2] + (inp_1.shape[ndim - 2], inp_2.shape[ndim - 2]))
+
+    def bigbird_block_sparse_attention(
+        self,
+        query_layer,
+        key_layer,
+        value_layer,
+        band_mask,
+        from_mask,
+        to_mask,
+        from_blocked_mask,
+        to_blocked_mask,
+        n_heads,
+        n_rand_blocks,
+        attention_head_size,
+        from_block_size,
+        to_block_size,
+        batch_size,
+        from_seq_len,
+        to_seq_len,
+        seed,
+        plan_from_length,
+        plan_num_rand_blocks,
+        output_attentions,
+    ):
+        # BigBird block-sparse attention as suggested in paper
+
+        # ITC:
+        #     global tokens: 2 x block_size
+        #     window tokens: 3 x block_size
+        #     random tokens: num_rand_tokens x block_size
+
+        # ETC:
+        #     global tokens: extra_globals_tokens + 2 x block_size
+        #     window tokens: 3 x block_size
+        #     random tokens: num_rand_tokens x block_size
+
+        # Note:
+        #     1) Currently, ETC is not supported.
+        #     2) Window size is fixed to 3 blocks & it can be changed only by
+        #     changing `block_size`.
+        #     3) Number of global blocks are fixed (2 blocks here) & global tokens can be
+        #     controlled only by `block_size`.
+
+        # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of shifting tokens (for calculating sliding attention)
+        # hence following code can be divided into 5 parts.
+
+        if from_seq_len // from_block_size != to_seq_len // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
+
+        rsqrt_d = 1 / math.sqrt(attention_head_size)
+        bsz = batch_size
+        attn_mask_penalty = -10000.0
+
+        # generate random attention and corresponding masks
+        np.random.seed(seed)
+        if from_seq_len in [1024, 3072, 4096]:  # old plans used in paper
+            rand_attn = [
+                self._bigbird_block_rand_mask(
+                    self.max_seqlen, self.max_seqlen, from_block_size, to_block_size, n_rand_blocks, last_idx=1024
+                )[: (from_seq_len // from_block_size - 2)]
+                for _ in range(n_heads)
+            ]
+        else:
+            if plan_from_length is None:
+                plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan(
+                    from_seq_len, from_block_size, n_rand_blocks
+                )
+
+            rand_attn = self._bigbird_block_rand_mask_with_head(
+                from_seq_length=from_seq_len,
+                to_seq_length=to_seq_len,
+                from_block_size=from_block_size,
+                to_block_size=to_block_size,
+                num_heads=n_heads,
+                plan_from_length=plan_from_length,
+                plan_num_rand_blocks=plan_num_rand_blocks,
+            )
+
+        rand_attn = np.stack(rand_attn, axis=0)
+        rand_attn = torch.tensor(rand_attn, device=query_layer.device, dtype=torch.long)
+        rand_attn.unsqueeze_(0)
+        rand_attn = torch.cat([rand_attn for _ in range(batch_size)], dim=0)
+
+        rand_mask = self._create_rand_mask_from_inputs(
+            from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size
+        )
+
+        blocked_query_matrix = query_layer.view(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
+        blocked_key_matrix = key_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
+        blocked_value_matrix = value_layer.view(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
+
+        # preparing block for randn attn
+        gathered_key = self.torch_gather_b2(blocked_key_matrix, rand_attn)
+        gathered_key = gathered_key.view(
+            bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1
+        )  # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1]
+        gathered_value = self.torch_gather_b2(blocked_value_matrix, rand_attn)
+        gathered_value = gathered_value.view(
+            bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1
+        )  # [bsz, n_heads, to_seq_len//to_block_size-2, n_rand_blocks, to_block_size, -1]
+
+        # 1st PART
+        # 1st block (global block) attention scores
+        # q[0] x (k[0], k[1], k[2], k[3], k[4] .... )
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len]
+        first_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 0], key_layer, ndim=4)
+
+        first_product = first_product * rsqrt_d
+        first_product += (1.0 - to_mask) * attn_mask_penalty
+        first_attn_weights = nn.functional.softmax(
+            first_product, dim=-1
+        )  # [bsz, n_heads, from_block_size, to_seq_len]
+
+        # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
+        first_context_layer = self.torch_bmm_nd(first_attn_weights, value_layer, ndim=4)
+        first_context_layer.unsqueeze_(2)
+
+        # 2nd PART
+        # 2nd block attention scores
+        # q[1] x (sliding_keys, random_keys, global_keys)
+        # sliding key blocks -> 2nd, 3rd blocks
+        # global key blocks -> 1st block
+
+        second_key_mat = torch.cat(
+            [
+                blocked_key_matrix[:, :, 0],
+                blocked_key_matrix[:, :, 1],
+                blocked_key_matrix[:, :, 2],
+                blocked_key_matrix[:, :, -1],
+                gathered_key[:, :, 0],
+            ],
+            dim=2,
+        )  # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1]
+        second_value_mat = torch.cat(
+            [
+                blocked_value_matrix[:, :, 0],
+                blocked_value_matrix[:, :, 1],
+                blocked_value_matrix[:, :, 2],
+                blocked_value_matrix[:, :, -1],
+                gathered_value[:, :, 0],
+            ],
+            dim=2,
+        )  # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1]
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+        second_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, 1], second_key_mat, ndim=4)
+        second_seq_pad = torch.cat(
+            [
+                to_mask[:, :, :, : 3 * to_block_size],
+                to_mask[:, :, :, -to_block_size:],
+                to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
+            ],
+            dim=3,
+        )
+        second_rand_pad = torch.cat(
+            [
+                rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
+                rand_mask[:, :, 0],
+            ],
+            dim=3,
+        )
+        second_product = second_product * rsqrt_d
+        second_product += (1.0 - torch.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
+        second_attn_weights = nn.functional.softmax(
+            second_product, dim=-1
+        )  # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+
+        # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1]
+        second_context_layer = self.torch_bmm_nd(second_attn_weights, second_value_mat, ndim=4)
+
+        second_context_layer.unsqueeze_(2)
+
+        # 3rd PART
+        # Middle blocks attention scores
+        # q[-2:2] x (sliding_keys, random_keys, global_keys)
+        # sliding attn is calculated using special trick of shifting tokens as discussed in paper
+        # random keys are generated by taking random indices as per `rand_attn`
+        # global keys -> 1st & last block
+
+        exp_blocked_key_matrix = torch.cat(
+            [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], dim=3
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        exp_blocked_value_matrix = torch.cat(
+            [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
+            dim=3,
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
+
+        # sliding attention scores for q[-2:2]
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        inner_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, exp_blocked_key_matrix, ndim=5)
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size]
+        inner_band_product = inner_band_product * rsqrt_d
+
+        # randn attention scores for q[-2:2]
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1]
+        rand_band_product = self.torch_bmm_nd_transpose(middle_query_matrix, gathered_key[:, :, 1:-1], ndim=5)
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size]
+        rand_band_product = rand_band_product * rsqrt_d
+
+        # Including 1st block (since it's global)
+        first_band_product = torch.einsum(
+            "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0]
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size]
+        first_band_product = first_band_product * rsqrt_d
+
+        # Including last block (since it's global)
+        last_band_product = torch.einsum(
+            "bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1]
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size]
+        last_band_product = last_band_product * rsqrt_d
+
+        # masking padded tokens
+        inner_band_product += (1.0 - band_mask) * attn_mask_penalty
+        first_band_product += (1.0 - to_mask[:, :, :, :to_block_size].unsqueeze(3)) * attn_mask_penalty
+        last_band_product += (1.0 - to_mask[:, :, :, -to_block_size:].unsqueeze(3)) * attn_mask_penalty
+        rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
+
+        # completing attention scores matrix for all q[-2:2]
+        band_product = torch.cat(
+            [first_band_product, inner_band_product, rand_band_product, last_band_product], dim=-1
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
+
+        # safely doing softmax since attention matrix is completed
+        attn_weights = nn.functional.softmax(
+            band_product, dim=-1
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
+
+        # contribution of sliding keys
+        # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        context_layer = self.torch_bmm_nd(
+            attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix, ndim=5
+        )
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+
+        # adding contribution of random keys
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size] x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1]
+        context_layer += self.torch_bmm_nd(
+            attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size], gathered_value[:, :, 1:-1], ndim=5
+        )
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+
+        # adding contribution of global keys
+        context_layer += torch.einsum(
+            "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+        context_layer += torch.einsum(
+            "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1] ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+
+        # 4th PART
+        # last 2nd token attention scores
+        # q[-2] x (sliding_keys, random_keys, global_keys)
+        # sliding key blocks -> last 3 blocks
+        # global key block -> 1st block
+        # random key block -> based on indices stored in `randn_attn`
+
+        second_last_key_mat = torch.cat(
+            [
+                blocked_key_matrix[:, :, 0],
+                blocked_key_matrix[:, :, -3],
+                blocked_key_matrix[:, :, -2],
+                blocked_key_matrix[:, :, -1],
+                gathered_key[:, :, -1],
+            ],
+            dim=2,
+        )  # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1]
+        second_last_value_mat = torch.cat(
+            [
+                blocked_value_matrix[:, :, 0],
+                blocked_value_matrix[:, :, -3],
+                blocked_value_matrix[:, :, -2],
+                blocked_value_matrix[:, :, -1],
+                gathered_value[:, :, -1],
+            ],
+            dim=2,
+        )  # [bsz, n_heads, (4+r)*to_block_size, -1]
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+        second_last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -2], second_last_key_mat, ndim=4)
+        second_last_seq_pad = torch.cat(
+            [
+                to_mask[:, :, :, :to_block_size],
+                to_mask[:, :, :, -3 * to_block_size :],
+                to_mask.new_ones([bsz, 1, 1, n_rand_blocks * to_block_size]),
+            ],
+            dim=3,
+        )
+        second_last_rand_pad = torch.cat(
+            [
+                rand_mask.new_ones([bsz, n_heads, from_block_size, 4 * to_block_size]),
+                rand_mask[:, :, -1],
+            ],
+            dim=3,
+        )
+        second_last_product = second_last_product * rsqrt_d
+        second_last_product += (1.0 - torch.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
+        second_last_attn_weights = nn.functional.softmax(
+            second_last_product, dim=-1
+        )  # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+
+        # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1] ==> [bsz, n_heads, from_block_size, -1]
+        second_last_context_layer = self.torch_bmm_nd(second_last_attn_weights, second_last_value_mat, ndim=4)
+        second_last_context_layer.unsqueeze_(2)
+
+        # 5th PART
+        # last block (global) attention scores
+        # q[-1] x (k[0], k[1], k[2], k[3], .... )
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len]
+        last_product = self.torch_bmm_nd_transpose(blocked_query_matrix[:, :, -1], key_layer, ndim=4)
+        last_product = last_product * rsqrt_d
+        last_product += (1.0 - to_mask) * attn_mask_penalty
+        last_attn_weights = nn.functional.softmax(last_product, dim=-1)  # [bsz, n_heads, from_block_size, n]
+
+        # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
+        last_context_layer = self.torch_bmm_nd(last_attn_weights, value_layer, ndim=4)
+        last_context_layer.unsqueeze_(2)
+
+        # combining representations of all tokens
+        context_layer = torch.cat(
+            [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer],
+            dim=2,
+        )
+        context_layer = context_layer.view((bsz, n_heads, from_seq_len, -1)) * from_mask
+        context_layer = torch.transpose(context_layer, 1, 2)
+
+        # this is just for visualizing; forward pass doesn't depend on following code
+        if output_attentions:
+            # TODO(PVP): need to verify if below code is correct
+            attention_probs = torch.zeros(
+                bsz, n_heads, from_seq_len, to_seq_len, dtype=torch.float, device=context_layer.device
+            )
+
+            # 1st query block
+            # corresponding to `first_context_layer`
+            attention_probs[:, :, :from_block_size, :] = first_attn_weights  # all keys global
+
+            # 2nd query block
+            # corresponding to `second_context_layer`
+            attention_probs[:, :, from_block_size : 2 * from_block_size, : 3 * to_block_size] = second_attn_weights[
+                :, :, :, : 3 * to_block_size
+            ]  # 1st three key blocks (global + sliding)
+            attention_probs[:, :, from_block_size : 2 * from_block_size, -to_block_size:] = second_attn_weights[
+                :, :, :, 3 * to_block_size : 4 * to_block_size
+            ]  # last key block (global)
+            # random keys
+            for p1, i1, w1 in zip(range(bsz), rand_attn, second_attn_weights):
+                # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch
+                for p2, i2, w2 in zip(range(n_heads), i1, w1):
+                    # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads
+                    attn_probs_view = attention_probs.view(
+                        bsz,
+                        n_heads,
+                        from_seq_len // from_block_size,
+                        from_block_size,
+                        to_seq_len // to_block_size,
+                        to_block_size,
+                    )
+                    right_slice = w2[:, 4 * to_block_size :]
+                    attn_probs_view[p1, p2, 1, :, i2[0]] = right_slice.view(
+                        from_block_size, n_rand_blocks, to_block_size
+                    )
+
+            # Middle query blocks
+            # corresponding to `context_layer`
+            # sliding keys
+            for q_idx in range(from_seq_len // from_block_size - 4):
+                attn_probs_view = attention_probs.view(
+                    bsz,
+                    n_heads,
+                    from_seq_len // from_block_size,
+                    from_block_size,
+                    to_seq_len // to_block_size,
+                    to_block_size,
+                )[:, :, 2:-2, :, 1:-1, :]
+                right_slice = attn_weights[:, :, q_idx, :, to_block_size : 4 * to_block_size]
+                attn_probs_view[:, :, q_idx, :, q_idx : q_idx + 3, :] = right_slice.view(
+                    bsz, n_heads, from_block_size, 3, to_block_size
+                )  # inner_band_product
+            # global keys (corresponding to 1st key block)
+            attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, :to_block_size] = attn_weights[
+                :, :, :, :, :to_block_size
+            ].view(bsz, n_heads, -1, to_block_size)  # first_band_product
+            # global keys (corresponding to last key block)
+            attention_probs[:, :, 2 * from_block_size : -2 * from_block_size, -to_block_size:] = attn_weights[
+                :, :, :, :, -to_block_size:
+            ].view(bsz, n_heads, -1, to_block_size)  # last_band_product
+            # random keys
+            for p1, i1, w1 in zip(range(bsz), rand_attn, attn_weights):
+                # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch
+                for p2, i2, w2 in zip(range(n_heads), i1, w1):
+                    # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads
+                    for q_idx in range(1, len(i2) - 1):
+                        attn_probs_view = attention_probs.view(
+                            bsz,
+                            n_heads,
+                            from_seq_len // from_block_size,
+                            from_block_size,
+                            to_seq_len // to_block_size,
+                            to_block_size,
+                        )
+                        right_slice = w2[q_idx - 1, :, 4 * to_block_size : -to_block_size]
+                        attn_probs_view[p1, p2, q_idx + 1, :, i2[q_idx]] = right_slice.view(
+                            from_block_size, n_rand_blocks, to_block_size
+                        )
+
+            # Second-last query block
+            # corresponding to `second_last_context_layer`
+            attention_probs[:, :, -2 * from_block_size : -from_block_size, :to_block_size] = second_last_attn_weights[
+                :, :, :, :to_block_size
+            ]  # 1st key block (global)
+            attention_probs[:, :, -2 * from_block_size : -from_block_size, -3 * to_block_size :] = (
+                second_last_attn_weights[:, :, :, to_block_size : 4 * to_block_size]
+            )  # last three blocks (global + sliding)
+            # random keys
+            for p1, i1, w1 in zip(range(bsz), rand_attn, second_last_attn_weights):
+                # p1, i1, w1 corresponds to batch_dim i.e. following operation is done for each sequence in batch
+                for p2, i2, w2 in zip(range(n_heads), i1, w1):
+                    # p2, i2, w2 corresponds to head_dim i.e. following operation is done for each heads
+                    attn_probs_view = attention_probs.view(
+                        bsz,
+                        n_heads,
+                        from_seq_len // from_block_size,
+                        from_block_size,
+                        to_seq_len // to_block_size,
+                        to_block_size,
+                    )
+                    right_slice = w2[:, 4 * to_block_size :]
+                    attn_probs_view[p1, p2, -2, :, i2[-1]] = right_slice.view(
+                        from_block_size, n_rand_blocks, to_block_size
+                    )
+
+            # last query block
+            # corresponding to `last_context_layer`
+            attention_probs[:, :, -from_block_size:, :] = last_attn_weights  # all keys global
+
+        else:
+            attention_probs = None
+
+        return context_layer, attention_probs
+
+    @staticmethod
+    def torch_gather_b2(params, indices):
+        # this operation is equivalent to tf.gather when batch_dims=2
+
+        if params.shape[:2] != indices.shape[:2]:
+            raise ValueError(
+                "Make sure that the first two dimensions of params and indices are identical,                 but"
+                f" they are params: {params.shape[:2]} vs. indices: {indices.shape[:2]}"
+            )
+        num_indices_to_gather = indices.shape[-2] * indices.shape[-1]
+        num_indices_to_pick_from = params.shape[2]
+
+        shift = torch.arange(indices.shape[0] * indices.shape[1] * num_indices_to_gather, device=indices.device)
+        indices_shift = torch.div(shift, num_indices_to_gather, rounding_mode="floor") * num_indices_to_pick_from
+
+        flattened_indices = indices.view(-1) + indices_shift
+        flattened_params = params.reshape(-1, params.shape[-2], params.shape[-1])
+
+        out_flattened = flattened_params.index_select(0, flattened_indices)
+
+        out = out_flattened.reshape(params.shape[:2] + (num_indices_to_gather,) + params.shape[3:])
+        return out
+
+    @staticmethod
+    def _create_rand_mask_from_inputs(
+        from_blocked_mask,
+        to_blocked_mask,
+        rand_attn,
+        num_attention_heads,
+        num_rand_blocks,
+        batch_size,
+        from_seq_length,
+        from_block_size,
+    ):
+        """
+        Create 3D attention mask from a 2D tensor mask.
+
+        Args:
+            from_blocked_mask: 2D Tensor of shape [batch_size,
+            from_seq_length//from_block_size, from_block_size].
+            to_blocked_mask: int32 Tensor of shape [batch_size,
+            to_seq_length//to_block_size, to_block_size].
+            rand_attn: [batch_size, num_attention_heads,
+            from_seq_length//from_block_size-2, num_rand_blocks]
+            num_attention_heads: int. Number of attention heads.
+            num_rand_blocks: int. Number of random chunks per row.
+            batch_size: int. Batch size for computation.
+            from_seq_length: int. length of from sequence.
+            from_block_size: int. size of block in from sequence.
+
+        Returns:
+            float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2,
+            from_block_size, num_rand_blocks*to_block_size].
+        """
+        num_windows = from_seq_length // from_block_size - 2
+        rand_mask = torch.stack([p1[i1.flatten()] for p1, i1 in zip(to_blocked_mask, rand_attn)])
+        rand_mask = rand_mask.view(batch_size, num_attention_heads, num_windows, num_rand_blocks * from_block_size)
+        rand_mask = torch.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
+        return rand_mask
+
+    @staticmethod
+    def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks):
+        """
+        Gives the plan of where to put random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            from_block_size: int. size of block in from sequence.
+            num_rand_blocks: int. Number of random chunks per row.
+
+        Returns:
+            plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for
+            each block
+        """
+
+        plan_from_length = []
+        plan_num_rand_blocks = []
+        if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size):
+            plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size))
+            plan_num_rand_blocks.append(num_rand_blocks)
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(0)
+        elif (num_rand_blocks + 5) < (from_seq_length // from_block_size):
+            plan_from_length.append(int((num_rand_blocks + 5) * from_block_size))
+            plan_num_rand_blocks.append(num_rand_blocks // 2)
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2))
+        else:
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(num_rand_blocks)
+
+        return plan_from_length, plan_num_rand_blocks
+
+    def _bigbird_block_rand_mask(
+        self, from_seq_length, to_seq_length, from_block_size, to_block_size, num_rand_blocks, last_idx=-1
+    ):
+        """
+        Create adjacency list of random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            to_seq_length: int. length of to sequence.
+            from_block_size: int. size of block in from sequence.
+            to_block_size: int. size of block in to sequence.
+            num_rand_blocks: int. Number of random chunks per row.
+            last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
+            if positive then num_rand_blocks blocks chosen only up to last_idx.
+
+        Returns:
+            adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
+        """
+        # using this method when from_seq_length in [1024, 3072, 4096]
+
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
+
+        rand_attn = np.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=np.int32)
+        # During inference (eval) no randomness
+        if not self.training:
+            return rand_attn
+        middle_seq = np.arange(1, to_seq_length // to_block_size - 1, dtype=np.int32)
+        last = to_seq_length // to_block_size - 1
+        if last_idx > (2 * to_block_size):
+            last = (last_idx // to_block_size) - 1
+
+        r = num_rand_blocks  # shorthand
+        for i in range(1, from_seq_length // from_block_size - 1):
+            start = i - 2
+            end = i
+            if i == 1:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[2:last])[:r]
+            elif i == 2:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[3:last])[:r]
+            elif i == from_seq_length // from_block_size - 3:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r]
+            # Missing -3: should have been sliced till last-3
+            elif i == from_seq_length // from_block_size - 2:
+                rand_attn[i - 1, :] = np.random.permutation(middle_seq[:last])[:r]
+            # Missing -4: should have been sliced till last-4
+            else:
+                if start > last:
+                    start = last
+                    rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r]
+                elif (end + 1) == last:
+                    rand_attn[i - 1, :] = np.random.permutation(middle_seq[:start])[:r]
+                else:
+                    rand_attn[i - 1, :] = np.random.permutation(
+                        np.concatenate((middle_seq[:start], middle_seq[end + 1 : last]))
+                    )[:r]
+        return rand_attn
+
+    def _bigbird_block_rand_mask_with_head(
+        self,
+        from_seq_length,
+        to_seq_length,
+        from_block_size,
+        to_block_size,
+        num_heads,
+        plan_from_length,
+        plan_num_rand_blocks,
+        window_block_left=1,
+        window_block_right=1,
+        global_block_top=1,
+        global_block_bottom=1,
+        global_block_left=1,
+        global_block_right=1,
+    ):
+        """
+        Create adjacency list of random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            to_seq_length: int. length of to sequence.
+            from_block_size: int. size of block in from sequence.
+            to_block_size: int. size of block in to sequence.
+            num_heads: int. total number of heads.
+            plan_from_length: list. plan from length where num_random_blocks are chosen from.
+            plan_num_rand_blocks: list. number of rand blocks within the plan.
+            window_block_left: int. number of blocks of window to left of a block.
+            window_block_right: int. number of blocks of window to right of a block.
+            global_block_top: int. number of blocks at the top.
+            global_block_bottom: int. number of blocks at the bottom.
+            global_block_left: int. Number of blocks globally used to the left.
+            global_block_right: int. Number of blocks globally used to the right.
+
+        Returns:
+            adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by
+            num_rand_blocks
+        """
+        # using this method when from_seq_length not in [1024, 3072, 4096]
+
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
+
+        if from_seq_length not in plan_from_length:
+            raise ValueError("Error from sequence length not in plan!")
+
+        # Total number of blocks in the mmask
+        num_blocks = from_seq_length // from_block_size
+        # Number of blocks per plan
+        plan_block_length = np.array(plan_from_length) // from_block_size
+        # till when to follow plan
+        max_plan_idx = plan_from_length.index(from_seq_length)
+
+        # Random Attention adjacency list
+        rand_attn = [
+            np.zeros((num_blocks, np.sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=np.int32)
+            for i in range(num_heads)
+        ]
+        # During inference (eval) no randomness
+        if not self.training:
+            for nh in range(num_heads):
+                rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
+            return rand_attn
+
+        # We will go iteratively over the plan blocks and pick random number of
+        # Attention blocks from the legally allowed blocks
+        for plan_idx in range(max_plan_idx + 1):
+            rnd_r_cnt = 0
+            if plan_idx > 0:
+                # set the row for all from_blocks starting from 0 to
+                # plan_block_length[plan_idx-1]
+                # column indx start from plan_block_length[plan_idx-1] and ends at
+                # plan_block_length[plan_idx]
+                if plan_num_rand_blocks[plan_idx] > 0:
+                    rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx]))
+                    curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1]))
+                    for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]):
+                        for h in range(num_heads):
+                            rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention(
+                                block_id=blk_rw_idx,
+                                to_start_block_id=plan_block_length[plan_idx - 1],
+                                to_end_block_id=plan_block_length[plan_idx],
+                                num_rand_blocks=plan_num_rand_blocks[plan_idx],
+                                window_block_left=window_block_left,
+                                window_block_right=window_block_right,
+                                global_block_left=global_block_left,
+                                global_block_right=global_block_right,
+                            )
+
+                for pl_id in range(plan_idx):
+                    if plan_num_rand_blocks[pl_id] == 0:
+                        continue
+                    for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]):
+                        rnd_r_cnt = 0
+                        to_start_block_id = 0
+                        if pl_id > 0:
+                            rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:pl_id]))
+                            to_start_block_id = plan_block_length[pl_id - 1]
+                        curr_r_cnt = int(np.sum(plan_num_rand_blocks[: pl_id + 1]))
+                        for h in range(num_heads):
+                            rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention(
+                                block_id=blk_rw_idx,
+                                to_start_block_id=to_start_block_id,
+                                to_end_block_id=plan_block_length[pl_id],
+                                num_rand_blocks=plan_num_rand_blocks[pl_id],
+                                window_block_left=window_block_left,
+                                window_block_right=window_block_right,
+                                global_block_left=global_block_left,
+                                global_block_right=global_block_right,
+                            )
+
+            if plan_num_rand_blocks[plan_idx] == 0:
+                continue
+            curr_r_cnt = int(np.sum(plan_num_rand_blocks[: plan_idx + 1]))
+            from_start_block_id = global_block_top
+            to_start_block_id = 0
+            if plan_idx > 0:
+                rnd_r_cnt = int(np.sum(plan_num_rand_blocks[:plan_idx]))
+                from_start_block_id = plan_block_length[plan_idx - 1]
+                to_start_block_id = plan_block_length[plan_idx - 1]
+
+            for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]):
+                for h in range(num_heads):
+                    rand_attn[h][blk_rw_idx, rnd_r_cnt:curr_r_cnt] = self._get_single_block_row_attention(
+                        block_id=blk_rw_idx,
+                        to_start_block_id=to_start_block_id,
+                        to_end_block_id=plan_block_length[plan_idx],
+                        num_rand_blocks=plan_num_rand_blocks[plan_idx],
+                        window_block_left=window_block_left,
+                        window_block_right=window_block_right,
+                        global_block_left=global_block_left,
+                        global_block_right=global_block_right,
+                    )
+
+        for nh in range(num_heads):
+            rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
+
+        return rand_attn
+
+    @staticmethod
+    def _get_single_block_row_attention(
+        block_id,
+        to_start_block_id,
+        to_end_block_id,
+        num_rand_blocks,
+        window_block_left=1,
+        window_block_right=1,
+        global_block_left=1,
+        global_block_right=1,
+    ):
+        """
+        For a single row block get random row attention.
+
+        Args:
+            block_id: int. block id of row.
+            to_start_block_id: int. random attention column start id.
+            to_end_block_id: int. random attention column end id.
+            num_rand_blocks: int. number of random blocks to be selected.
+            window_block_left: int. number of blocks of window to left of a block.
+            window_block_right: int. number of blocks of window to right of a block.
+            global_block_left: int. Number of blocks globally used to the left.
+            global_block_right: int. Number of blocks globally used to the right.
+
+        Returns:
+            row containing the random attention vector of size num_rand_blocks.
+        """
+        # list of to_blocks from which to choose random attention
+        to_block_list = np.arange(to_start_block_id, to_end_block_id, dtype=np.int32)
+        # permute the blocks
+        perm_block = np.random.permutation(to_block_list)
+
+        # illegal blocks for the current block id, using window
+        illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1))
+
+        # Add blocks at the start and at the end
+        illegal_blocks.extend(list(range(global_block_left)))
+        illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id)))
+
+        # The second from_block cannot choose random attention on second last to_block
+        if block_id == 1:
+            illegal_blocks.append(to_end_block_id - 2)
+
+        # The second last from_block cannot choose random attention on second to_block
+        if block_id == to_end_block_id - 2:
+            illegal_blocks.append(1)
+
+        selected_random_blocks = []
+
+        for i in range(to_end_block_id - to_start_block_id):
+            if perm_block[i] not in illegal_blocks:
+                selected_random_blocks.append(perm_block[i])
+            if len(selected_random_blocks) == num_rand_blocks:
+                break
+        return np.array(selected_random_blocks, dtype=np.int32)
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert->BigBird
+class BigBirdSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BigBirdAttention(nn.Module):
+    def __init__(self, config, seed=None):
+        super().__init__()
+        self.attention_type = config.attention_type
+        self.config = config
+        self.seed = seed
+
+        if self.config.attention_type == "original_full":
+            self.self = BigBirdSelfAttention(config, layer_idx=seed)
+        elif self.config.attention_type == "block_sparse":
+            self.self = BigBirdBlockSparseAttention(config, seed)
+        else:
+            raise ValueError(
+                f"attention_type can either be original_full or block_sparse, but is {self.config.attention_type}"
+            )
+
+        self.output = BigBirdSelfOutput(config)
+
+    def set_attention_type(self, value: str, layer_idx=None):
+        if value not in ["original_full", "block_sparse"]:
+            raise ValueError(
+                f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}"
+            )
+        # attention type is already correctly set
+        if value == self.attention_type:
+            return
+
+        self.attention_type = value
+        if value == "original_full":
+            # copy all weights to new full attention class
+            attn_weights = BigBirdSelfAttention(self.config, layer_idx=layer_idx)
+        else:
+            # copy all weights to new sparse attention class
+            attn_weights = BigBirdBlockSparseAttention(self.config, self.seed)
+
+        attn_weights.query = self.self.query
+        attn_weights.value = self.self.value
+        attn_weights.key = self.self.key
+        self.self = attn_weights
+        if not self.training:
+            self.self.eval()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        output_attentions=False,
+        # block_sparse config
+        band_mask=None,
+        from_mask=None,
+        to_mask=None,
+        from_blocked_mask=None,
+        to_blocked_mask=None,
+        cache_position=None,
+    ):
+        # fp16 compatibility
+        if band_mask is not None:
+            band_mask = band_mask.to(hidden_states.dtype)
+        if from_mask is not None:
+            from_mask = from_mask.to(hidden_states.dtype)
+        if to_mask is not None:
+            to_mask = to_mask.to(hidden_states.dtype)
+        if self.attention_type == "original_full":
+            self_outputs = self.self(
+                hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+        else:
+            if encoder_hidden_states is not None:
+                raise ValueError("BigBird cannot be used as a decoder when config.attention_type != 'original_full'")
+            self_outputs = self.self(
+                hidden_states, band_mask, from_mask, to_mask, from_blocked_mask, to_blocked_mask, output_attentions
+            )
+
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert->BigBird
+class BigBirdIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert->BigBird
+class BigBirdOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BigBirdLayer(GradientCheckpointingLayer):
+    def __init__(self, config, seed=None):
+        super().__init__()
+        self.config = config
+        self.attention_type = config.attention_type
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = BigBirdAttention(config, seed=seed)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise TypeError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = BigBirdAttention(config, seed=seed)
+        self.intermediate = BigBirdIntermediate(config)
+        self.output = BigBirdOutput(config)
+
+    def set_attention_type(self, value: str, layer_idx=None):
+        if value not in ["original_full", "block_sparse"]:
+            raise ValueError(
+                f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}"
+            )
+        # attention type is already correctly set
+        if value == self.attention_type:
+            return
+        self.attention_type = value
+        self.attention.set_attention_type(value, layer_idx=layer_idx)
+
+        if self.add_cross_attention:
+            self.crossattention.set_attention_type(value, layer_idx=layer_idx)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        band_mask=None,
+        from_mask=None,
+        to_mask=None,
+        blocked_encoder_mask=None,
+        past_key_values=None,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            band_mask=band_mask,
+            from_mask=from_mask,
+            to_mask=to_mask,
+            from_blocked_mask=blocked_encoder_mask,
+            to_blocked_mask=blocked_encoder_mask,
+            cache_position=cache_position,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with                    "
+                    " cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+
+        return (layer_output,) + outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class BigBirdEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.attention_type = config.attention_type
+
+        self.layer = nn.ModuleList(
+            [BigBirdLayer(config, seed=layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.gradient_checkpointing = False
+
+    def set_attention_type(self, value: str):
+        if value not in ["original_full", "block_sparse"]:
+            raise ValueError(
+                f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}"
+            )
+        # attention type is already correctly set
+        if value == self.attention_type:
+            return
+        self.attention_type = value
+        for i, layer in enumerate(self.layer):
+            layer.set_attention_type(value, layer_idx=i)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        band_mask=None,
+        from_mask=None,
+        to_mask=None,
+        blocked_encoder_mask=None,
+        return_dict=True,
+        cache_position=None,
+    ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `DynamicCache` instead, e.g. "
+                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                band_mask,
+                from_mask,
+                to_mask,
+                blocked_encoder_mask,
+                past_key_values,
+                output_attentions,
+                cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BigBird
+class BigBirdPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BigBird
+class BigBirdLMPredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BigBirdPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BigBird
+class BigBirdOnlyMLMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BigBirdLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyNSPHead with Bert->BigBird
+class BigBirdOnlyNSPHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert->BigBird
+class BigBirdPreTrainingHeads(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BigBirdLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+@auto_docstring
+class BigBirdPreTrainedModel(PreTrainedModel):
+    config: BigBirdConfig
+    load_tf_weights = load_tf_weights_in_big_bird
+    base_model_prefix = "bert"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, BigBirdLMPredictionHead):
+            module.bias.data.zero_()
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`BigBirdForPreTraining`].
+    """
+)
+class BigBirdForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the masked language modeling loss and the next sequence prediction
+        (classification) loss.
+    prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+        Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+        before SoftMax).
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: Optional[torch.FloatTensor] = None
+    seq_relationship_logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of question answering models.
+    """
+)
+class BigBirdForQuestionAnsweringModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Total span extraction loss is the sum of a Cross-Entropy for the start and end positions.
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, 1)`):
+        pooler output from BigBigModel
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    start_logits: Optional[torch.FloatTensor] = None
+    end_logits: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@auto_docstring
+class BigBirdModel(BigBirdPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.attention_type = self.config.attention_type
+        self.config = config
+
+        self.block_size = self.config.block_size
+
+        self.embeddings = BigBirdEmbeddings(config)
+        self.encoder = BigBirdEncoder(config)
+
+        if add_pooling_layer:
+            self.pooler = nn.Linear(config.hidden_size, config.hidden_size)
+            self.activation = nn.Tanh()
+        else:
+            self.pooler = None
+            self.activation = None
+
+        if self.attention_type != "original_full" and config.add_cross_attention:
+            logger.warning(
+                "When using `BigBirdForCausalLM` as decoder, then `attention_type` must be `original_full`. Setting"
+                " `attention_type=original_full`"
+            )
+            self.set_attention_type("original_full")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def set_attention_type(self, value: str):
+        if value not in ["original_full", "block_sparse"]:
+            raise ValueError(
+                f"attention_type can only be set to either 'original_full' or 'block_sparse', but is {value}"
+            )
+        # attention type is already correctly set
+        if value == self.attention_type:
+            return
+        self.attention_type = value
+        self.encoder.set_attention_type(value)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,  # NOOP kwargs, for now
+    ) -> Union[BaseModelOutputWithPoolingAndCrossAttentions, tuple[torch.FloatTensor]]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = (
+                past_key_values[0][0].shape[-2]
+                if not isinstance(past_key_values, Cache)
+                else past_key_values.get_seq_length()
+            )
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # in order to use block_sparse attention, sequence_length has to be at least
+        # bigger than all global attentions: 2 * block_size
+        # + sliding tokens: 3 * block_size
+        # + random tokens: 2 * num_random_blocks * block_size
+        max_tokens_to_attend = (5 + 2 * self.config.num_random_blocks) * self.config.block_size
+        if self.attention_type == "block_sparse" and seq_length <= max_tokens_to_attend:
+            # change attention_type from block_sparse to original_full
+            sequence_length = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1)
+            logger.warning(
+                "Attention type 'block_sparse' is not possible if sequence_length: "
+                f"{sequence_length} <= num global tokens: 2 * config.block_size "
+                "+ min. num sliding tokens: 3 * config.block_size "
+                "+ config.num_random_blocks * config.block_size "
+                "+ additional buffer: config.num_random_blocks * config.block_size "
+                f"= {max_tokens_to_attend} with config.block_size "
+                f"= {self.config.block_size}, config.num_random_blocks "
+                f"= {self.config.num_random_blocks}. "
+                "Changing attention type to 'original_full'..."
+            )
+            self.set_attention_type("original_full")
+
+        if self.attention_type == "block_sparse":
+            (
+                padding_len,
+                input_ids,
+                attention_mask,
+                token_type_ids,
+                position_ids,
+                inputs_embeds,
+            ) = self._pad_to_block_size(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                token_type_ids=token_type_ids,
+                position_ids=position_ids,
+                inputs_embeds=inputs_embeds,
+                pad_token_id=self.config.pad_token_id,
+            )
+        else:
+            padding_len = 0
+
+        if self.attention_type == "block_sparse":
+            blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn(
+                attention_mask, self.block_size
+            )
+            extended_attention_mask = None
+
+        elif self.attention_type == "original_full":
+            blocked_encoder_mask = None
+            band_mask = None
+            from_mask = None
+            to_mask = None
+            # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+            # ourselves in which case we just need to make it broadcastable to all heads.
+            extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+        else:
+            raise ValueError(
+                f"attention_type can either be original_full or block_sparse, but is {self.attention_type}"
+            )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            band_mask=band_mask,
+            from_mask=from_mask,
+            to_mask=to_mask,
+            blocked_encoder_mask=blocked_encoder_mask,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        sequence_output = encoder_outputs[0]
+
+        pooler_output = self.activation(self.pooler(sequence_output[:, 0, :])) if (self.pooler is not None) else None
+
+        # undo padding
+        if padding_len > 0:
+            # unpad `sequence_output` because the calling function is expecting a length == input_ids.size(1)
+            sequence_output = sequence_output[:, :-padding_len]
+
+        if not return_dict:
+            return (sequence_output, pooler_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooler_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    @staticmethod
+    def create_masks_for_block_sparse_attn(attention_mask: torch.Tensor, block_size: int):
+        batch_size, seq_length = attention_mask.size()
+        if seq_length % block_size != 0:
+            raise ValueError(
+                f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block"
+                f" size is {block_size}."
+            )
+
+        def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask):
+            """
+            Create 3D attention mask from a 2D tensor mask.
+
+            Args:
+                from_blocked_mask: 2D Tensor of shape [batch_size,
+                from_seq_length//from_block_size, from_block_size].
+                to_blocked_mask: int32 Tensor of shape [batch_size,
+                to_seq_length//to_block_size, to_block_size].
+
+            Returns:
+                float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size,
+                3*to_block_size].
+            """
+            exp_blocked_to_pad = torch.cat(
+                [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], dim=2
+            )
+            band_mask = torch.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
+            band_mask.unsqueeze_(1)
+            return band_mask
+
+        blocked_encoder_mask = attention_mask.view(batch_size, seq_length // block_size, block_size)
+        band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask)
+
+        from_mask = attention_mask.view(batch_size, 1, seq_length, 1)
+        to_mask = attention_mask.view(batch_size, 1, 1, seq_length)
+
+        return blocked_encoder_mask, band_mask, from_mask, to_mask
+
+    def _pad_to_block_size(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: torch.Tensor,
+        token_type_ids: torch.Tensor,
+        position_ids: torch.Tensor,
+        inputs_embeds: torch.Tensor,
+        pad_token_id: int,
+    ):
+        """A helper function to pad tokens and mask to work with implementation of BigBird block-sparse attention."""
+        # padding
+        block_size = self.config.block_size
+
+        input_shape = input_ids.shape if input_ids is not None else inputs_embeds.shape
+        batch_size, seq_len = input_shape[:2]
+
+        padding_len = (block_size - seq_len % block_size) % block_size
+        if padding_len > 0:
+            logger.warning_once(
+                f"Input ids are automatically padded from {seq_len} to {seq_len + padding_len} to be a multiple of "
+                f"`config.block_size`: {block_size}"
+            )
+            if input_ids is not None:
+                input_ids = nn.functional.pad(input_ids, (0, padding_len), value=pad_token_id)
+            if position_ids is not None:
+                # pad with position_id = pad_token_id as in modeling_bigbird.BigBirdEmbeddings
+                position_ids = nn.functional.pad(position_ids, (0, padding_len), value=pad_token_id)
+            if inputs_embeds is not None:
+                input_ids_padding = inputs_embeds.new_full(
+                    (batch_size, padding_len),
+                    self.config.pad_token_id,
+                    dtype=torch.long,
+                )
+                inputs_embeds_padding = self.embeddings(input_ids_padding)
+                inputs_embeds = torch.cat([inputs_embeds, inputs_embeds_padding], dim=-2)
+
+            attention_mask = nn.functional.pad(
+                attention_mask, (0, padding_len), value=False
+            )  # no attention on the padding tokens
+            token_type_ids = nn.functional.pad(token_type_ids, (0, padding_len), value=0)  # pad with token_type_id = 0
+
+        return padding_len, input_ids, attention_mask, token_type_ids, position_ids, inputs_embeds
+
+
+class BigBirdForPreTraining(BigBirdPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BigBirdModel(config, add_pooling_layer=True)
+        self.cls = BigBirdPreTrainingHeads(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.FloatTensor] = None,
+        next_sentence_label: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BigBirdForPreTrainingOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sequence prediction (classification) loss. If specified, nsp loss will be
+            added to masked_lm loss. Input should be a sequence pair (see `input_ids` docstring) Indices should be in
+            `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, BigBirdForPreTraining
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
+        >>> model = BigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.prediction_logits
+        >>> seq_relationship_logits = outputs.seq_relationship_logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+        total_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            total_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if next_sentence_label is not None and total_loss is not None:
+            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+            total_loss = total_loss + next_sentence_loss
+
+        if not return_dict:
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return BigBirdForPreTrainingOutput(
+            loss=total_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class BigBirdForMaskedLM(BigBirdPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `BigBirdForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.bert = BigBirdModel(config)
+        self.cls = BigBirdOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[MaskedLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, BigBirdForMaskedLM
+        >>> from datasets import load_dataset
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
+        >>> model = BigBirdForMaskedLM.from_pretrained("google/bigbird-roberta-base")
+        >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
+
+        >>> # select random long article
+        >>> LONG_ARTICLE_TARGET = squad_ds[81514]["context"]
+        >>> # select random sentence
+        >>> LONG_ARTICLE_TARGET[332:398]
+        'the highest values are very close to the theoretical maximum value'
+
+        >>> # add mask_token
+        >>> LONG_ARTICLE_TO_MASK = LONG_ARTICLE_TARGET.replace("maximum", "[MASK]")
+        >>> inputs = tokenizer(LONG_ARTICLE_TO_MASK, return_tensors="pt")
+        >>> # long article input
+        >>> list(inputs["input_ids"].shape)
+        [1, 919]
+
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits
+        >>> # retrieve index of [MASK]
+        >>> mask_token_index = (inputs.input_ids == tokenizer.mask_token_id)[0].nonzero(as_tuple=True)[0]
+        >>> predicted_token_id = logits[0, mask_token_index].argmax(axis=-1)
+        >>> tokenizer.decode(predicted_token_id)
+        'maximum'
+        ```
+
+        ```python
+        >>> labels = tokenizer(LONG_ARTICLE_TARGET, return_tensors="pt")["input_ids"]
+        >>> labels = torch.where(inputs.input_ids == tokenizer.mask_token_id, labels, -100)
+        >>> outputs = model(**inputs, labels=labels)
+        >>> round(outputs.loss.item(), 2)
+        1.99
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+
+        #  add a dummy token
+        if self.config.pad_token_id is None:
+            raise ValueError("The PAD token should be defined for generation")
+        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+        dummy_token = torch.full(
+            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+        )
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+@auto_docstring(
+    custom_intro="""
+    BigBird Model with a `language modeling` head on top for CLM fine-tuning.
+    """
+)
+class BigBirdForCausalLM(BigBirdPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning("If you want to use `BigBirdForCausalLM` as a standalone, add `is_decoder=True.`")
+
+        self.bert = BigBirdModel(config)
+        self.cls = BigBirdOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[CausalLMOutputWithCrossAttentions, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            lm_loss = self.loss_function(
+                prediction_scores,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+class BigBirdClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+        self.config = config
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = ACT2FN[self.config.hidden_act](x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+@auto_docstring(
+    custom_intro="""
+    BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """
+)
+class BigBirdForSequenceClassification(BigBirdPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+        self.bert = BigBirdModel(config)
+        self.classifier = BigBirdClassificationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[SequenceClassifierOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, BigBirdForSequenceClassification
+        >>> from datasets import load_dataset
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
+        >>> model = BigBirdForSequenceClassification.from_pretrained("l-yohai/bigbird-roberta-base-mnli")
+        >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
+
+        >>> LONG_ARTICLE = squad_ds[81514]["context"]
+        >>> inputs = tokenizer(LONG_ARTICLE, return_tensors="pt")
+        >>> # long input article
+        >>> list(inputs["input_ids"].shape)
+        [1, 919]
+
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits
+        >>> predicted_class_id = logits.argmax().item()
+        >>> model.config.id2label[predicted_class_id]
+        'LABEL_0'
+        ```
+
+        ```python
+        >>> num_labels = len(model.config.id2label)
+        >>> model = BigBirdForSequenceClassification.from_pretrained(
+        ...     "l-yohai/bigbird-roberta-base-mnli", num_labels=num_labels
+        ... )
+        >>> labels = torch.tensor(1)
+        >>> loss = model(**inputs, labels=labels).loss
+        >>> round(loss.item(), 2)
+        1.13
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class BigBirdForMultipleChoice(BigBirdPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BigBirdModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[MultipleChoiceModelOutput, tuple[torch.FloatTensor]]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        token_type_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `(batch_size, num_choices, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_choices, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class BigBirdForTokenClassification(BigBirdPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = BigBirdModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[TokenClassifierOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class BigBirdForQuestionAnsweringHead(nn.Module):
+    """Head for question answering tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.intermediate = BigBirdIntermediate(config)
+        self.output = BigBirdOutput(config)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, encoder_output):
+        hidden_states = self.dropout(encoder_output)
+        hidden_states = self.intermediate(hidden_states)
+        hidden_states = self.output(hidden_states, encoder_output)
+        hidden_states = self.qa_outputs(hidden_states)
+        return hidden_states
+
+
+@auto_docstring
+class BigBirdForQuestionAnswering(BigBirdPreTrainedModel):
+    def __init__(self, config, add_pooling_layer=False):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+
+        config.num_labels = 2
+        self.num_labels = config.num_labels
+        self.sep_token_id = config.sep_token_id
+
+        self.bert = BigBirdModel(config, add_pooling_layer=add_pooling_layer)
+        self.qa_classifier = BigBirdForQuestionAnsweringHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        question_lengths: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[BigBirdForQuestionAnsweringModelOutput, tuple[torch.FloatTensor]]:
+        r"""
+        question_lengths (`torch.LongTensor` of shape `(batch_size, 1)`, *optional*):
+            The lengths of the questions in the batch.
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, BigBirdForQuestionAnswering
+        >>> from datasets import load_dataset
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
+        >>> model = BigBirdForQuestionAnswering.from_pretrained("google/bigbird-roberta-base")
+        >>> squad_ds = load_dataset("rajpurkar/squad_v2", split="train")  # doctest: +IGNORE_RESULT
+
+        >>> # select random article and question
+        >>> LONG_ARTICLE = squad_ds[81514]["context"]
+        >>> QUESTION = squad_ds[81514]["question"]
+        >>> QUESTION
+        'During daytime how high can the temperatures reach?'
+
+        >>> inputs = tokenizer(QUESTION, LONG_ARTICLE, return_tensors="pt")
+        >>> # long article and question input
+        >>> list(inputs["input_ids"].shape)
+        [1, 929]
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> answer_start_index = outputs.start_logits.argmax()
+        >>> answer_end_index = outputs.end_logits.argmax()
+        >>> predict_answer_token_ids = inputs.input_ids[0, answer_start_index : answer_end_index + 1]
+        >>> predict_answer_token = tokenizer.decode(predict_answer_token_ids)
+        ```
+
+        ```python
+        >>> target_start_index, target_end_index = torch.tensor([130]), torch.tensor([132])
+        >>> outputs = model(**inputs, start_positions=target_start_index, end_positions=target_end_index)
+        >>> loss = outputs.loss
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        seqlen = input_ids.size(1) if input_ids is not None else inputs_embeds.size(1)
+
+        if question_lengths is None and input_ids is not None:
+            # assuming input_ids format: <cls> <question> <sep> context <sep>
+            question_lengths = torch.argmax(input_ids.eq(self.sep_token_id).int(), dim=-1) + 1
+            question_lengths.unsqueeze_(1)
+
+        logits_mask = None
+        if question_lengths is not None:
+            # setting lengths logits to `-inf`
+            logits_mask = self.prepare_question_mask(question_lengths, seqlen)
+            if token_type_ids is None:
+                token_type_ids = torch.ones(logits_mask.size(), dtype=int, device=logits_mask.device) - logits_mask
+            logits_mask[:, 0] = False
+            logits_mask.unsqueeze_(2)
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.qa_classifier(sequence_output)
+
+        if logits_mask is not None:
+            # removing question tokens from the competition
+            logits = logits - logits_mask * 1e6
+
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return BigBirdForQuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            pooler_output=outputs.pooler_output,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    @staticmethod
+    def prepare_question_mask(q_lengths: torch.Tensor, maxlen: int):
+        # q_lengths -> (bz, 1)
+        mask = torch.arange(0, maxlen).to(q_lengths.device)
+        mask.unsqueeze_(0)  # -> (1, maxlen)
+        mask = torch.where(mask < q_lengths, 1, 0)
+        return mask
+
+
+__all__ = [
+    "BigBirdForCausalLM",
+    "BigBirdForMaskedLM",
+    "BigBirdForMultipleChoice",
+    "BigBirdForPreTraining",
+    "BigBirdForQuestionAnswering",
+    "BigBirdForSequenceClassification",
+    "BigBirdForTokenClassification",
+    "BigBirdLayer",
+    "BigBirdModel",
+    "BigBirdPreTrainedModel",
+    "load_tf_weights_in_big_bird",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py
new file mode 100644
index 0000000000000000000000000000000000000000..11dcb30f3d47e3ecf6d90d7a7305682d687e5828
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/modeling_flax_big_bird.py
@@ -0,0 +1,2648 @@
+# coding=utf-8
+# Copyright 2021 The Google Flax Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional
+
+import flax
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen import partitioning as nn_partitioning
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxBaseModelOutputWithPooling,
+    FlaxBaseModelOutputWithPoolingAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxMaskedLMOutput,
+    FlaxMultipleChoiceModelOutput,
+    FlaxSequenceClassifierOutput,
+    FlaxTokenClassifierOutput,
+)
+from ...modeling_flax_utils import (
+    ACT2FN,
+    FlaxPreTrainedModel,
+    append_call_sample_docstring,
+    append_replace_return_docstrings,
+    overwrite_call_docstring,
+)
+from ...utils import ModelOutput, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_big_bird import BigBirdConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "google/bigbird-roberta-base"
+_CONFIG_FOR_DOC = "BigBirdConfig"
+
+remat = nn_partitioning.remat
+
+
+@flax.struct.dataclass
+class FlaxBigBirdForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`BigBirdForPreTraining`].
+
+    Args:
+        prediction_logits (`jnp.ndarray` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        seq_relationship_logits (`jnp.ndarray` of shape `(batch_size, 2)`):
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+            before SoftMax).
+        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    prediction_logits: jnp.ndarray = None
+    seq_relationship_logits: jnp.ndarray = None
+    hidden_states: Optional[tuple[jnp.ndarray]] = None
+    attentions: Optional[tuple[jnp.ndarray]] = None
+
+
+@flax.struct.dataclass
+class FlaxBigBirdForQuestionAnsweringModelOutput(ModelOutput):
+    """
+    Base class for outputs of question answering models.
+
+    Args:
+        start_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Span-start scores (before SoftMax).
+        end_logits (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Span-end scores (before SoftMax).
+        pooled_output (`jnp.ndarray` of shape `(batch_size, hidden_size)`):
+            pooled_output returned by FlaxBigBirdModel.
+        hidden_states (`tuple(jnp.ndarray)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `jnp.ndarray` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(jnp.ndarray)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `jnp.ndarray` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    start_logits: jnp.ndarray = None
+    end_logits: jnp.ndarray = None
+    pooled_output: jnp.ndarray = None
+    hidden_states: Optional[tuple[jnp.ndarray]] = None
+    attentions: Optional[tuple[jnp.ndarray]] = None
+
+
+BIG_BIRD_START_DOCSTRING = r"""
+
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading, saving and converting weights from PyTorch models)
+
+    This model is also a
+    [flax.linen.Module](https://flax.readthedocs.io/en/latest/api_reference/flax.linen/module.html) subclass. Use it as
+    a regular Flax linen Module and refer to the Flax documentation for all matter related to general usage and
+    behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`BigBirdConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+BIG_BIRD_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`numpy.ndarray` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`numpy.ndarray` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`numpy.ndarray` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        head_mask (`numpy.ndarray` of shape `({0})`, `optional):
+            Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+"""
+
+
+class FlaxBigBirdEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEmbeddings.setup
+    def setup(self):
+        self.word_embeddings = nn.Embed(
+            self.config.vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.position_embeddings = nn.Embed(
+            self.config.max_position_embeddings,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.token_type_embeddings = nn.Embed(
+            self.config.type_vocab_size,
+            self.config.hidden_size,
+            embedding_init=jax.nn.initializers.normal(stddev=self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def __call__(self, input_ids, token_type_ids, position_ids, attention_mask, deterministic: bool = True):
+        # Embed
+        inputs_embeds = self.word_embeddings(input_ids.astype("i4"))
+        position_embeds = self.position_embeddings(position_ids.astype("i4"))
+        token_type_embeddings = self.token_type_embeddings(token_type_ids.astype("i4"))
+
+        if self.config.rescale_embeddings:
+            inputs_embeds *= self.config.hidden_size**0.5
+
+        # Sum all embeddings
+        hidden_states = inputs_embeds + token_type_embeddings + position_embeds
+
+        # Layer Norm
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfAttention with Bert->BigBird
+class FlaxBigBirdSelfAttention(nn.Module):
+    config: BigBirdConfig
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.head_dim = self.config.hidden_size // self.config.num_attention_heads
+        if self.config.hidden_size % self.config.num_attention_heads != 0:
+            raise ValueError(
+                "`config.hidden_size`: {self.config.hidden_size} has to be a multiple of `config.num_attention_heads` "
+                "                   : {self.config.num_attention_heads}"
+            )
+
+        self.query = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.key = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.value = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+        if self.causal:
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.num_attention_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.config.hidden_size,))
+
+    @nn.compact
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention._concatenate_to_cache
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+
+        # get query proj
+        query_states = self.query(hidden_states)
+        # get key, value proj
+        if is_cross_attention:
+            # cross_attentions
+            key_states = self.key(key_value_states)
+            value_states = self.value(key_value_states)
+        else:
+            # self_attention
+            key_states = self.key(hidden_states)
+            value_states = self.value(hidden_states)
+
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+
+        # handle cache prepare causal attention mask
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+
+        dropout_rng = None
+        if not deterministic and self.config.attention_probs_dropout_prob > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.config.attention_probs_dropout_prob,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        # Mask heads if we want to
+        if layer_head_mask is not None:
+            attn_weights = jnp.einsum("...hqk,h->...hqk", attn_weights, layer_head_mask)
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = attn_output.reshape(attn_output.shape[:2] + (-1,))
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+
+class FlaxBigBirdBlockSparseAttention(nn.Module):
+    config: BigBirdConfig
+    block_sparse_seed: Optional[int] = None
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.query = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            use_bias=self.config.use_bias,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.key = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            use_bias=self.config.use_bias,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+        self.value = nn.Dense(
+            self.config.hidden_size,
+            dtype=self.dtype,
+            use_bias=self.config.use_bias,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+        )
+
+    @staticmethod
+    def transpose_for_scores(x, n_heads, head_size):
+        new_x_shape = x.shape[:-1] + (n_heads, head_size)
+        x = x.reshape(*new_x_shape)
+        return jnp.transpose(x, axes=(0, 2, 1, 3))
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        deterministic=True,
+        output_attentions=False,
+    ):
+        n_heads = self.config.num_attention_heads
+        head_size = self.config.hidden_size // n_heads
+
+        blocked_encoder_mask, band_mask, from_mask, to_mask = self.create_masks_for_block_sparse_attn(
+            attention_mask, self.config.block_size
+        )
+
+        query_layer = self.transpose_for_scores(self.query(hidden_states), n_heads, head_size)
+        key_layer = self.transpose_for_scores(self.key(hidden_states), n_heads, head_size)
+        value_layer = self.transpose_for_scores(self.value(hidden_states), n_heads, head_size)
+
+        indices_prng_key = None
+        if not deterministic:
+            indices_prng_key = self.make_rng("indices")
+
+        attn_output, attn_weights = self.bigbird_block_sparse_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            band_mask,
+            from_mask,
+            to_mask,
+            blocked_encoder_mask,
+            blocked_encoder_mask,
+            n_heads,
+            head_size,
+            indices_prng_key=indices_prng_key,
+            deterministic=deterministic,
+            plan_from_length=None,
+            plan_num_rand_blocks=None,
+            output_attentions=output_attentions,
+        )
+
+        outputs = (attn_output, attn_weights) if output_attentions else (attn_output,)
+        return outputs
+
+    @staticmethod
+    def create_masks_for_block_sparse_attn(attention_mask, block_size: int):
+        batch_size, seq_length = attention_mask.shape
+        if seq_length % block_size != 0:
+            raise ValueError(
+                f"Sequence length must be multiple of block size, but sequence length is {seq_length}, while block"
+                f" size is {block_size}."
+            )
+
+        def create_band_mask_from_inputs(from_blocked_mask, to_blocked_mask):
+            """
+            Create 3D attention mask from a 2D tensor mask.
+
+            Args:
+                from_blocked_mask: 2D Tensor of shape [batch_size,
+                from_seq_length//from_block_size, from_block_size].
+                to_blocked_mask: int32 Tensor of shape [batch_size,
+                to_seq_length//to_block_size, to_block_size].
+
+            Returns:
+                float Tensor of shape [batch_size, 1, from_seq_length//from_block_size-4, from_block_size,
+                3*to_block_size].
+            """
+            exp_blocked_to_pad = jnp.concatenate(
+                [to_blocked_mask[:, 1:-3], to_blocked_mask[:, 2:-2], to_blocked_mask[:, 3:-1]], axis=2
+            )
+            band_mask = jnp.einsum("blq,blk->blqk", from_blocked_mask[:, 2:-2], exp_blocked_to_pad)
+            band_mask = jnp.expand_dims(band_mask, 1)
+            return band_mask
+
+        blocked_encoder_mask = attention_mask.reshape(batch_size, seq_length // block_size, block_size)
+        band_mask = create_band_mask_from_inputs(blocked_encoder_mask, blocked_encoder_mask)
+
+        from_mask = attention_mask.reshape(batch_size, 1, seq_length, 1)
+        to_mask = attention_mask.reshape(batch_size, 1, 1, seq_length)
+
+        return blocked_encoder_mask, band_mask, from_mask, to_mask
+
+    def bigbird_block_sparse_attention(
+        self,
+        query_layer,
+        key_layer,
+        value_layer,
+        band_mask,
+        from_mask,
+        to_mask,
+        from_blocked_mask,
+        to_blocked_mask,
+        n_heads,
+        head_size,
+        indices_prng_key: Optional[jax.random.PRNGKey] = None,
+        deterministic: Optional[bool] = True,
+        plan_from_length=None,
+        plan_num_rand_blocks=None,
+        output_attentions=None,
+    ):
+        # BigBird block-sparse attention as suggested in paper
+
+        # ITC:
+        #     global tokens: 2 x block_size
+        #     window tokens: 3 x block_size
+        #     random tokens: num_rand_tokens x block_size
+
+        # ETC:
+        #     global tokens: extra_globals_tokens + 2 x block_size
+        #     window tokens: 3 x block_size
+        #     random tokens: num_rand_tokens x block_size
+
+        # Note:
+        #     1) Currently, ETC is not supported.
+        #     2) Window size is fixed to 3 blocks & it can be changed only by
+        #     changing `block_size`.
+        #     3) Number of global blocks are fixed (2 blocks here) & global tokens can be
+        #     controlled only by `block_size`.
+
+        # attention is calculated separately for q[0], q[1], q[2:-2], q[-2], q[-1] in order to use special trick of
+        # shifting tokens (for calculating sliding attention). hence following code can be divided into 5 parts.
+
+        bsz, _, from_seq_len, _ = query_layer.shape
+        to_seq_len = key_layer.shape[2]
+        from_block_size = to_block_size = self.config.block_size
+
+        if from_seq_len % from_block_size != 0:
+            raise ValueError("Query sided sequence length must be multiple of block size")
+
+        if to_seq_len % to_block_size != 0:
+            raise ValueError("Key/Value sided sequence length must be multiple of block size")
+
+        if from_seq_len // from_block_size != to_seq_len // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
+
+        n_rand_blocks = self.config.num_random_blocks
+        rsqrt_d = 1 / jnp.sqrt(head_size)
+        attn_mask_penalty = -10000.0
+
+        if from_seq_len in [1024, 3072, 4096]:  # old plans used in paper
+            max_seqlen = self.config.max_position_embeddings
+            rand_attn = [
+                self._bigbird_block_rand_mask(
+                    max_seqlen,
+                    max_seqlen,
+                    from_block_size,
+                    to_block_size,
+                    n_rand_blocks,
+                    indices_prng_key=indices_prng_key,
+                    deterministic=deterministic,
+                    last_idx=1024,
+                )[: (from_seq_len // from_block_size - 2)]
+                for _ in range(n_heads)
+            ]
+        else:
+            if plan_from_length is None:
+                plan_from_length, plan_num_rand_blocks = self._get_rand_attn_plan(
+                    from_seq_len, from_block_size, n_rand_blocks
+                )
+            rand_attn = self._bigbird_block_rand_mask_with_head(
+                from_seq_length=from_seq_len,
+                to_seq_length=to_seq_len,
+                from_block_size=from_block_size,
+                to_block_size=to_block_size,
+                num_heads=n_heads,
+                plan_from_length=plan_from_length,
+                plan_num_rand_blocks=plan_num_rand_blocks,
+                indices_prng_key=indices_prng_key,
+            )
+
+        rand_attn = jnp.stack(rand_attn, axis=0)
+        rand_attn = jnp.broadcast_to(rand_attn, (bsz,) + rand_attn.shape)
+
+        rand_mask = self._create_rand_mask_from_inputs(
+            from_blocked_mask, to_blocked_mask, rand_attn, n_heads, n_rand_blocks, bsz, from_seq_len, from_block_size
+        )
+
+        blocked_query_matrix = query_layer.reshape(bsz, n_heads, from_seq_len // from_block_size, from_block_size, -1)
+        blocked_key_matrix = key_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
+        blocked_value_matrix = value_layer.reshape(bsz, n_heads, to_seq_len // to_block_size, to_block_size, -1)
+
+        shape = (bsz, n_heads, to_seq_len // to_block_size - 2, n_rand_blocks * to_block_size, -1)
+        gathered_key = self.jax_gather(blocked_key_matrix, rand_attn, batch_dims=2).reshape(*shape)
+        gathered_value = self.jax_gather(blocked_value_matrix, rand_attn, batch_dims=2).reshape(*shape)
+
+        # 1st PART
+        # 1st block (global block) attention scores
+        # q[0] x (k[0], k[1], k[2], k[3], k[4] .... )
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len]
+        first_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 0], key_layer)
+
+        first_product = first_product * rsqrt_d
+        first_product += (1.0 - to_mask) * attn_mask_penalty
+        first_attn_weights = jax.nn.softmax(first_product, axis=-1)  # [bsz, n_heads, from_block_size, to_seq_len]
+
+        # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
+        first_context_layer = jnp.einsum("bhqk,bhkd->bhqd", first_attn_weights, value_layer)
+        first_context_layer = jnp.expand_dims(first_context_layer, 2)
+
+        # 2nd PART
+        # 2nd block attention scores
+        # q[1] x (sliding_keys, random_keys, global_keys)
+        # sliding key blocks -> 2nd, 3rd blocks
+        # global key blocks -> 1st block
+
+        second_key_mat = jnp.concatenate(
+            [
+                blocked_key_matrix[:, :, 0],
+                blocked_key_matrix[:, :, 1],
+                blocked_key_matrix[:, :, 2],
+                blocked_key_matrix[:, :, -1],
+                gathered_key[:, :, 0],
+            ],
+            axis=2,
+        )  # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1]
+        second_value_mat = jnp.concatenate(
+            [
+                blocked_value_matrix[:, :, 0],
+                blocked_value_matrix[:, :, 1],
+                blocked_value_matrix[:, :, 2],
+                blocked_value_matrix[:, :, -1],
+                gathered_value[:, :, 0],
+            ],
+            axis=2,
+        )  # [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1]
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1]
+        # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+        second_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, 1], second_key_mat)
+        second_seq_pad = jnp.concatenate(
+            [
+                to_mask[:, :, :, : 3 * to_block_size],
+                to_mask[:, :, :, -to_block_size:],
+                jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype),
+            ],
+            axis=3,
+        )
+        second_rand_pad = jnp.concatenate(
+            [
+                jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype),
+                rand_mask[:, :, 0],
+            ],
+            axis=3,
+        )
+        second_product = second_product * rsqrt_d
+        second_product += (1.0 - jnp.minimum(second_seq_pad, second_rand_pad)) * attn_mask_penalty
+        second_attn_weights = jax.nn.softmax(
+            second_product, axis=-1
+        )  # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+
+        # [bsz, n_heads, from_block_size, (4+r)*to_block_size] x [bsz, n_heads, (4+r)*to_block_size, -1]
+        #  ==> [bsz, n_heads, from_block_size, -1]
+        second_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_attn_weights, second_value_mat)
+        second_context_layer = jnp.expand_dims(second_context_layer, 2)
+
+        # 3rd PART
+        # Middle blocks attention scores
+        # q[-2:2] x (sliding_keys, random_keys, global_keys)
+        # sliding attn is calculated using special trick of shifting tokens as discussed in paper
+        # random keys are generated by taking random indices as per `rand_attn`
+        # global keys -> 1st & last block
+
+        exp_blocked_key_matrix = jnp.concatenate(
+            [blocked_key_matrix[:, :, 1:-3], blocked_key_matrix[:, :, 2:-2], blocked_key_matrix[:, :, 3:-1]], axis=3
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        exp_blocked_value_matrix = jnp.concatenate(
+            [blocked_value_matrix[:, :, 1:-3], blocked_value_matrix[:, :, 2:-2], blocked_value_matrix[:, :, 3:-1]],
+            axis=3,
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        middle_query_matrix = blocked_query_matrix[:, :, 2:-2]
+
+        # sliding attention scores for q[-2:2]
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [b, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        inner_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, exp_blocked_key_matrix)
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, 3*to_block_size]
+        inner_band_product = inner_band_product * rsqrt_d
+
+        # randn attention scores for q[-2:2]
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+        # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1]
+        rand_band_product = jnp.einsum("bhlqd,bhlkd->bhlqk", middle_query_matrix, gathered_key[:, :, 1:-1])
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size]
+        rand_band_product = rand_band_product * rsqrt_d
+
+        # Including 1st block (since it's global)
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1]
+        #  ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size]
+        first_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, 0])
+        first_band_product = first_band_product * rsqrt_d
+
+        # Including last block (since it's global)
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1] x [bsz, n_heads, to_block_size, -1]
+        #  ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size]
+        last_band_product = jnp.einsum("bhlqd,bhkd->bhlqk", middle_query_matrix, blocked_key_matrix[:, :, -1])
+        last_band_product = last_band_product * rsqrt_d
+
+        # masking padded tokens
+        inner_band_product += (1.0 - band_mask) * attn_mask_penalty
+        first_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, :to_block_size], 3)) * attn_mask_penalty
+        last_band_product += (1.0 - jnp.expand_dims(to_mask[:, :, :, -to_block_size:], 3)) * attn_mask_penalty
+        rand_band_product += (1.0 - rand_mask[:, :, 1:-1]) * attn_mask_penalty
+
+        # completing attention scores matrix for all q[-2:2]
+        band_product = jnp.concatenate(
+            [first_band_product, inner_band_product, rand_band_product, last_band_product], axis=-1
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
+
+        # safely doing softmax since attention matrix is completed
+        attn_weights = jax.nn.softmax(
+            band_product, axis=-1
+        )  # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, (5+n_rand_blocks)*to_block_size]
+
+        # contribution of sliding keys
+        # [bsz, n_heads, m//from_block_size-4, from_block_size, 3*to_block_size]
+        # x [bsz, n_heads, from_seq_len//from_block_size-4, 3*to_block_size, -1]
+        context_layer = jnp.einsum(
+            "bhlqk,bhlkd->bhlqd", attn_weights[:, :, :, :, to_block_size : 4 * to_block_size], exp_blocked_value_matrix
+        )
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+
+        # adding contribution of random keys
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, n_rand_blocks*to_block_size]
+        # x [bsz, n_heads, from_seq_len//from_block_size-4, n_rand_blocks*to_block_size, -1]
+        context_layer += jnp.einsum(
+            "bhlqk,bhlkd->bhlqd",
+            attn_weights[:, :, :, :, 4 * to_block_size : -to_block_size],
+            gathered_value[:, :, 1:-1],
+        )
+        #     ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+
+        # adding contribution of global keys
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1]
+        #  ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+        context_layer += jnp.einsum(
+            "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, :to_block_size], blocked_value_matrix[:, :, 0]
+        )
+        # [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, to_block_size] x [bsz, n_heads, to_block_size, -1]
+        # ==> [bsz, n_heads, from_seq_len//from_block_size-4, from_block_size, -1]
+        context_layer += jnp.einsum(
+            "bhlqk,bhkd->bhlqd", attn_weights[:, :, :, :, -to_block_size:], blocked_value_matrix[:, :, -1]
+        )
+
+        # 4th PART
+        # last 2nd token attention scores
+        # q[-2] x (sliding_keys, random_keys, global_keys)
+        # sliding key blocks -> last 3 blocks
+        # global key block -> 1st block
+        # random key block -> based on indices stored in `randn_attn`
+
+        second_last_key_mat = jnp.concatenate(
+            [
+                blocked_key_matrix[:, :, 0],
+                blocked_key_matrix[:, :, -3],
+                blocked_key_matrix[:, :, -2],
+                blocked_key_matrix[:, :, -1],
+                gathered_key[:, :, -1],
+            ],
+            axis=2,
+        )  # [bsz, n_heads, (4+n_random_blocks)*to_block_size, -1]
+        second_last_value_mat = jnp.concatenate(
+            [
+                blocked_value_matrix[:, :, 0],
+                blocked_value_matrix[:, :, -3],
+                blocked_value_matrix[:, :, -2],
+                blocked_value_matrix[:, :, -1],
+                gathered_value[:, :, -1],
+            ],
+            axis=2,
+        )  # [bsz, n_heads, (4+r)*to_block_size, -1]
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1]
+        # ==> [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+        second_last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -2], second_last_key_mat)
+        second_last_seq_pad = jnp.concatenate(
+            [
+                to_mask[:, :, :, :to_block_size],
+                to_mask[:, :, :, -3 * to_block_size :],
+                jnp.ones([bsz, 1, 1, n_rand_blocks * to_block_size], dtype=to_mask.dtype),
+            ],
+            axis=3,
+        )
+        second_last_rand_pad = jnp.concatenate(
+            [
+                jnp.ones([bsz, n_heads, from_block_size, 4 * to_block_size], dtype=rand_mask.dtype),
+                rand_mask[:, :, -1],
+            ],
+            axis=3,
+        )
+        second_last_product = second_last_product * rsqrt_d
+        second_last_product += (1.0 - jnp.minimum(second_last_seq_pad, second_last_rand_pad)) * attn_mask_penalty
+        second_last_attn_weights = jax.nn.softmax(
+            second_last_product, axis=-1
+        )  # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size]
+
+        # [bsz, n_heads, from_block_size, (4+n_rand_blocks)*to_block_size] x [bsz, n_heads, (4+n_rand_blocks)*to_block_size, -1]
+        # ==> [bsz, n_heads, from_block_size, -1]
+        second_last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", second_last_attn_weights, second_last_value_mat)
+        second_last_context_layer = jnp.expand_dims(second_last_context_layer, 2)
+
+        # 5th PART
+        # last block (global) attention scores
+        # q[-1] x (k[0], k[1], k[2], k[3], .... )
+
+        # [bsz, n_heads, from_block_size, -1] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, to_seq_len]
+        last_product = jnp.einsum("bhqd,bhkd->bhqk", blocked_query_matrix[:, :, -1], key_layer)
+        last_product = last_product * rsqrt_d
+        last_product += (1.0 - to_mask) * attn_mask_penalty
+        last_attn_weights = jax.nn.softmax(last_product, axis=-1)  # [bsz, n_heads, from_block_size, n]
+
+        # [bsz, n_heads, from_block_size, to_seq_len] x [bsz, n_heads, to_seq_len, -1] ==> [bsz, n_heads, from_block_size, -1]
+        last_context_layer = jnp.einsum("bhqk,bhkd->bhqd", last_attn_weights, value_layer)
+        last_context_layer = jnp.expand_dims(last_context_layer, 2)
+
+        # combining representations of all tokens
+        context_layer = jnp.concatenate(
+            [first_context_layer, second_context_layer, context_layer, second_last_context_layer, last_context_layer],
+            axis=2,
+        )
+        context_layer = context_layer.reshape(bsz, n_heads, from_seq_len, -1) * from_mask
+        context_layer = jnp.transpose(context_layer, axes=(0, 2, 1, 3)).reshape(bsz, from_seq_len, -1)
+
+        attention_probs = None
+
+        return context_layer, attention_probs
+
+    @staticmethod
+    def jax_gather(params, indices, batch_dims=2):
+        """
+        Gather the indices from params correctly (equivalent to tf.gather but with modifications)
+
+        Args:
+            params: (bsz, n_heads, num_blocks, block_size, head_dim)
+            indices: (<num_blocks, 1)
+        """
+
+        def _jax_gather(params, indices):
+            return params[indices]
+
+        for _ in range(batch_dims):
+            _jax_gather = jax.vmap(_jax_gather, in_axes=(0, 0))
+
+        return _jax_gather(params, indices)  # params.shape[:batch_dims] + indices.shape + params.shape[batch_dims+1:]
+
+    def _create_rand_mask_from_inputs(
+        self,
+        from_blocked_mask,
+        to_blocked_mask,
+        broadcasted_rand_attn,
+        num_attention_heads,
+        num_random_blocks,
+        batch_size,
+        from_seq_length,
+        from_block_size,
+    ):
+        """
+        Create 3D attention mask from a 2D tensor mask.
+
+        Args:
+            from_blocked_mask: 2D Tensor of shape [batch_size, from_seq_length//from_block_size, from_block_size].
+            to_blocked_mask: int32 Tensor of shape [batch_size, to_seq_length//to_block_size, to_block_size].
+            broadcasted_rand_attn:
+                [batch_size, num_attention_heads, from_seq_length//from_block_size-2, num_rand_blocks]
+            num_attention_heads: int. Number of attention heads.
+            num_random_blocks: int. Number of random chunks per row.
+            batch_size: int. Batch size for computation.
+            from_seq_length: int. length of from sequence.
+            from_block_size: int. size of block in from sequence.
+
+        Returns:
+            float Tensor of shape [batch_size, num_attention_heads, from_seq_length//from_block_size-2,
+            from_block_size, num_rand_blocks*to_block_size].
+        """
+        num_windows = from_seq_length // from_block_size - 2
+        rand_mask = self.jax_gather(to_blocked_mask, broadcasted_rand_attn, batch_dims=1)
+        rand_mask = rand_mask.reshape(
+            batch_size, num_attention_heads, num_windows, num_random_blocks * from_block_size
+        )
+        rand_mask = jnp.einsum("blq,bhlk->bhlqk", from_blocked_mask[:, 1:-1], rand_mask)
+        return rand_mask
+
+    @staticmethod
+    def _get_rand_attn_plan(from_seq_length, from_block_size, num_rand_blocks):
+        """
+        Gives the plan of where to put random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            from_block_size: int. size of block in from sequence.
+            num_rand_blocks: int. Number of random chunks per row.
+
+        Returns:
+            plan_from_length: ending location of from block plan_num_rand_blocks: number of random ending location for
+            each block
+        """
+
+        plan_from_length = []
+        plan_num_rand_blocks = []
+        if (2 * num_rand_blocks + 5) < (from_seq_length // from_block_size):
+            plan_from_length.append(int((2 * num_rand_blocks + 5) * from_block_size))
+            plan_num_rand_blocks.append(num_rand_blocks)
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(0)
+        elif (num_rand_blocks + 5) < (from_seq_length // from_block_size):
+            plan_from_length.append(int((num_rand_blocks + 5) * from_block_size))
+            plan_num_rand_blocks.append(num_rand_blocks // 2)
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(num_rand_blocks - (num_rand_blocks // 2))
+        else:
+            plan_from_length.append(from_seq_length)
+            plan_num_rand_blocks.append(num_rand_blocks)
+
+        return plan_from_length, plan_num_rand_blocks
+
+    @staticmethod
+    def _bigbird_block_rand_mask(
+        from_seq_length,
+        to_seq_length,
+        from_block_size,
+        to_block_size,
+        num_rand_blocks,
+        indices_prng_key: Optional[jax.random.PRNGKey] = None,
+        deterministic: Optional[bool] = True,
+        last_idx: Optional[int] = -1,
+    ):
+        """
+        Create adjacency list of random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            to_seq_length: int. length of to sequence.
+            from_block_size: int. size of block in from sequence.
+            to_block_size: int. size of block in to sequence.
+            num_rand_blocks: int. Number of random chunks per row.
+            indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations.
+            deterministic: bool. When False random attention will be used.
+            last_idx: if -1 then num_rand_blocks blocks chosen anywhere in to sequence,
+            if positive then num_rand_blocks blocks chosen only up to last_idx.
+
+        Returns:
+            adjacency list of size from_seq_length//from_block_size-2 by num_rand_blocks
+        """
+        # using this method when from_seq_length in [1024, 3072, 4096]
+
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
+        rand_attn = jnp.zeros((from_seq_length // from_block_size - 2, num_rand_blocks), dtype=jnp.int32)
+        # deterministic nor randomness
+        if deterministic:
+            return rand_attn
+
+        middle_seq = jnp.arange(1, to_seq_length // to_block_size - 1, dtype=jnp.int32)
+        last = to_seq_length // to_block_size - 1
+        if last_idx > (2 * to_block_size):
+            last = (last_idx // to_block_size) - 1
+
+        r = num_rand_blocks  # shorthand
+        for i in range(1, from_seq_length // from_block_size - 1):
+            start = i - 2
+            end = i
+            if i == 1:
+                seq_values = jax.random.permutation(indices_prng_key, middle_seq[2:last])[:r]
+                rand_attn = rand_attn.at[i - 1].set(seq_values)
+            elif i == 2:
+                seq_values = jax.random.permutation(indices_prng_key, middle_seq[3:last])[:r]
+                rand_attn = rand_attn.at[i - 1].set(seq_values)
+            elif i == from_seq_length // from_block_size - 3:
+                seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r]
+                rand_attn = rand_attn.at[i - 1].set(seq_values)
+            # Missing -3: should have been sliced till last-3
+            elif i == from_seq_length // from_block_size - 2:
+                seq_values = jax.random.permutation(indices_prng_key, middle_seq[:last])[:r]
+                rand_attn = rand_attn.at[i - 1].set(seq_values)
+            # Missing -4: should have been sliced till last-4
+            else:
+                if start > last:
+                    start = last
+                    seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r]
+                    rand_attn = rand_attn.at[i - 1].set(seq_values)
+                elif (end + 1) == last:
+                    seq_values = jax.random.permutation(indices_prng_key, middle_seq[:start])[:r]
+                    rand_attn = rand_attn.at[i - 1].set(seq_values)
+                else:
+                    concat_values = jnp.concatenate((middle_seq[:start], middle_seq[end + 1 : last]))
+                    seq_values = jax.random.permutation(indices_prng_key, concat_values)[:r]
+                    rand_attn = rand_attn.at[i - 1].set(seq_values)
+        return rand_attn
+
+    def _bigbird_block_rand_mask_with_head(
+        self,
+        from_seq_length,
+        to_seq_length,
+        from_block_size,
+        to_block_size,
+        num_heads,
+        plan_from_length,
+        plan_num_rand_blocks,
+        indices_prng_key: Optional[jax.random.PRNGKey] = None,
+        deterministic: Optional[bool] = True,
+        window_block_left=1,
+        window_block_right=1,
+        global_block_top=1,
+        global_block_bottom=1,
+        global_block_left=1,
+        global_block_right=1,
+    ):
+        """
+        Create adjacency list of random attention.
+
+        Args:
+            from_seq_length: int. length of from sequence.
+            to_seq_length: int. length of to sequence.
+            from_block_size: int. size of block in from sequence.
+            to_block_size: int. size of block in to sequence.
+            num_heads: int. total number of heads.
+            plan_from_length: list. plan from length where num_random_blocks are chosen from.
+            plan_num_rand_blocks: list. number of rand blocks within the plan.
+            indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations.
+            deterministic: bool. When False random attention will be used.
+            window_block_left: int. number of blocks of window to left of a block.
+            window_block_right: int. number of blocks of window to right of a block.
+            global_block_top: int. number of blocks at the top.
+            global_block_bottom: int. number of blocks at the bottom.
+            global_block_left: int. Number of blocks globally used to the left.
+            global_block_right: int. Number of blocks globally used to the right.
+
+        Returns:
+            adjacency list of size num_head where each element is of size from_seq_length//from_block_size-2 by
+            num_rand_blocks
+        """
+        # using this method when from_seq_length not in [1024, 3072, 4096]
+
+        if from_seq_length // from_block_size != to_seq_length // to_block_size:
+            raise ValueError("Error the number of blocks needs to be same!")
+
+        if from_seq_length not in plan_from_length:
+            raise ValueError("Error from sequence length not in plan!")
+
+        # Total number of blocks in the mmask
+        num_blocks = from_seq_length // from_block_size
+        # Number of blocks per plan
+        plan_block_length = jnp.array(plan_from_length) // from_block_size
+        # till when to follow plan
+        max_plan_idx = plan_from_length.index(from_seq_length)
+
+        # Random Attention adjacency list
+        rand_attn = [
+            jnp.zeros((num_blocks, sum(plan_num_rand_blocks[: max_plan_idx + 1])), dtype=jnp.int32)
+            for i in range(num_heads)
+        ]
+
+        # deterministic
+        if deterministic:
+            for nh in range(num_heads):
+                rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
+            return rand_attn
+
+        # We will go iteratively over the plan blocks and pick random number of
+        # Attention blocks from the legally allowed blocks
+        for plan_idx in range(max_plan_idx + 1):
+            rnd_r_cnt = 0
+            if plan_idx > 0:
+                # set the row for all from_blocks starting from 0 to
+                # plan_block_length[plan_idx-1]
+                # column indx start from plan_block_length[plan_idx-1] and ends at
+                # plan_block_length[plan_idx]
+                if plan_num_rand_blocks[plan_idx] > 0:
+                    rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx]))
+                    curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1]))
+                    for blk_rw_idx in range(global_block_top, plan_block_length[plan_idx - 1]):
+                        for h in range(num_heads):
+                            single_block_row_attention = self._get_single_block_row_attention(
+                                block_id=blk_rw_idx,
+                                to_start_block_id=plan_block_length[plan_idx - 1],
+                                to_end_block_id=plan_block_length[plan_idx],
+                                num_rand_blocks=plan_num_rand_blocks[plan_idx],
+                                window_block_left=window_block_left,
+                                window_block_right=window_block_right,
+                                global_block_left=global_block_left,
+                                global_block_right=global_block_right,
+                                indices_prng_key=indices_prng_key,
+                            )
+                            rand_attn[h] = (
+                                rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention)
+                            )
+
+                for pl_id in range(plan_idx):
+                    if plan_num_rand_blocks[pl_id] == 0:
+                        continue
+                    for blk_rw_idx in range(plan_block_length[plan_idx - 1], plan_block_length[plan_idx]):
+                        rnd_r_cnt = 0
+                        to_start_block_id = 0
+                        if pl_id > 0:
+                            rnd_r_cnt = int(sum(plan_num_rand_blocks[:pl_id]))
+                            to_start_block_id = plan_block_length[pl_id - 1]
+                        curr_r_cnt = int(sum(plan_num_rand_blocks[: pl_id + 1]))
+                        for h in range(num_heads):
+                            single_block_row_attention = self._get_single_block_row_attention(
+                                block_id=blk_rw_idx,
+                                to_start_block_id=to_start_block_id,
+                                to_end_block_id=plan_block_length[pl_id],
+                                num_rand_blocks=plan_num_rand_blocks[pl_id],
+                                window_block_left=window_block_left,
+                                window_block_right=window_block_right,
+                                global_block_left=global_block_left,
+                                global_block_right=global_block_right,
+                                indices_prng_key=indices_prng_key,
+                            )
+                            rand_attn[h] = (
+                                rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention)
+                            )
+
+            if plan_num_rand_blocks[plan_idx] == 0:
+                continue
+            curr_r_cnt = int(sum(plan_num_rand_blocks[: plan_idx + 1]))
+            from_start_block_id = global_block_top
+            to_start_block_id = 0
+            if plan_idx > 0:
+                rnd_r_cnt = int(sum(plan_num_rand_blocks[:plan_idx]))
+                from_start_block_id = plan_block_length[plan_idx - 1]
+                to_start_block_id = plan_block_length[plan_idx - 1]
+            for blk_rw_idx in range(from_start_block_id, plan_block_length[plan_idx]):
+                for h in range(num_heads):
+                    single_block_row_attention = self._get_single_block_row_attention(
+                        block_id=blk_rw_idx,
+                        to_start_block_id=to_start_block_id,
+                        to_end_block_id=plan_block_length[plan_idx],
+                        num_rand_blocks=plan_num_rand_blocks[plan_idx],
+                        window_block_left=window_block_left,
+                        window_block_right=window_block_right,
+                        global_block_left=global_block_left,
+                        global_block_right=global_block_right,
+                        indices_prng_key=indices_prng_key,
+                    )
+                    rand_attn[h] = rand_attn[h].at[blk_rw_idx, rnd_r_cnt:curr_r_cnt].set(single_block_row_attention)
+
+        for nh in range(num_heads):
+            rand_attn[nh] = rand_attn[nh][global_block_top : num_blocks - global_block_bottom, :]
+        return rand_attn
+
+    @staticmethod
+    def _get_single_block_row_attention(
+        block_id,
+        to_start_block_id,
+        to_end_block_id,
+        num_rand_blocks,
+        indices_prng_key: Optional[jax.random.PRNGKey] = None,
+        window_block_left=1,
+        window_block_right=1,
+        global_block_left=1,
+        global_block_right=1,
+    ):
+        """
+        For a single row block get random row attention.
+
+        Args:
+            block_id: int. block id of row.
+            to_start_block_id: int. random attention column start id.
+            to_end_block_id: int. random attention column end id.
+            num_rand_blocks: int. number of random blocks to be selected.
+            indices_prng_key: jax.random.PRNGKey. PRNG key that is used to perform random jax operations
+            window_block_left: int. number of blocks of window to left of a block.
+            window_block_right: int. number of blocks of window to right of a block.
+            global_block_left: int. Number of blocks globally used to the left.
+            global_block_right: int. Number of blocks globally used to the right.
+
+        Returns:
+            row containing the random attention vector of size num_rand_blocks.
+        """
+        # list of to_blocks from which to choose random attention
+        to_block_list = jnp.arange(to_start_block_id, to_end_block_id, dtype=jnp.int32)
+        # permute the blocks
+        perm_block = jax.random.permutation(indices_prng_key, to_block_list)
+
+        # illegal blocks for the current block id, using window
+        illegal_blocks = list(range(block_id - window_block_left, block_id + window_block_right + 1))
+
+        # Add blocks at the start and at the end
+        illegal_blocks.extend(list(range(global_block_left)))
+        illegal_blocks.extend(list(range(to_end_block_id - global_block_right, to_end_block_id)))
+
+        # The second from_block cannot choose random attention on second last to_block
+        if block_id == 1:
+            illegal_blocks.append(to_end_block_id - 2)
+
+        # The second last from_block cannot choose random attention on second to_block
+        if block_id == to_end_block_id - 2:
+            illegal_blocks.append(1)
+
+        selected_random_blocks = []
+
+        for i in range(to_end_block_id - to_start_block_id):
+            if perm_block[i] not in illegal_blocks:
+                selected_random_blocks.append(perm_block[i])
+            if len(selected_random_blocks) == num_rand_blocks:
+                break
+        return jnp.array(selected_random_blocks, dtype=jnp.int32)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertSelfOutput with Bert->BigBird
+class FlaxBigBirdSelfOutput(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+
+    def __call__(self, hidden_states, input_tensor, deterministic: bool = True):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class FlaxBigBirdAttention(nn.Module):
+    config: BigBirdConfig
+    layer_id: Optional[int] = None
+    causal: bool = False
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        if self.config.attention_type == "original_full":
+            self.self = FlaxBigBirdSelfAttention(self.config, causal=self.causal, dtype=self.dtype)
+        elif self.config.attention_type == "block_sparse":
+            self.self = FlaxBigBirdBlockSparseAttention(self.config, block_sparse_seed=self.layer_id, dtype=self.dtype)
+        else:
+            raise ValueError(
+                f"Your `config.attention_type` is {self.config.attention_type} but it can either be `original_full` or"
+                " `block_sparse`"
+            )
+
+        self.output = FlaxBigBirdSelfOutput(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        key_value_states=None,
+        init_cache=False,
+        deterministic=True,
+        output_attentions: bool = False,
+    ):
+        # Attention mask comes in as attention_mask.shape == (*batch_sizes, kv_length)
+        # FLAX expects: attention_mask.shape == (*batch_sizes, 1, 1, kv_length) such that it is broadcastable
+        # with attn_weights.shape == (*batch_sizes, num_heads, q_length, kv_length)
+        if self.config.attention_type == "original_full":
+            attn_outputs = self.self(
+                hidden_states,
+                attention_mask,
+                layer_head_mask=layer_head_mask,
+                key_value_states=key_value_states,
+                init_cache=init_cache,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+        else:
+            attn_outputs = self.self(
+                hidden_states,
+                attention_mask,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+        attn_output = attn_outputs[0]
+        hidden_states = self.output(attn_output, hidden_states, deterministic=deterministic)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_outputs[1],)
+
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertIntermediate with Bert->BigBird
+class FlaxBigBirdIntermediate(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.intermediate_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.activation = ACT2FN[self.config.hidden_act]
+
+    def __call__(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOutput with Bert->BigBird
+class FlaxBigBirdOutput(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dense = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+
+    def __call__(self, hidden_states, attention_output, deterministic: bool = True):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        hidden_states = self.LayerNorm(hidden_states + attention_output)
+        return hidden_states
+
+
+class FlaxBigBirdLayer(nn.Module):
+    config: BigBirdConfig
+    layer_id: Optional[int] = None
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.attention = FlaxBigBirdAttention(
+            self.config, layer_id=self.layer_id, causal=self.config.is_decoder, dtype=self.dtype
+        )
+        self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype)
+        self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype)
+        if self.config.add_cross_attention:
+            self.crossattention = FlaxBigBirdAttention(self.config, causal=False, dtype=self.dtype)
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayer.__call__ with Bert->BigBird
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+    ):
+        # Self Attention
+        attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            layer_head_mask=layer_head_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+        )
+        attention_output = attention_outputs[0]
+
+        # Cross-Attention Block
+        if encoder_hidden_states is not None:
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=layer_head_mask,
+                key_value_states=encoder_hidden_states,
+                deterministic=deterministic,
+                output_attentions=output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+
+        hidden_states = self.intermediate(attention_output)
+        hidden_states = self.output(hidden_states, attention_output, deterministic=deterministic)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attention_outputs[1],)
+            if encoder_hidden_states is not None:
+                outputs += (cross_attention_outputs[1],)
+        return outputs
+
+
+class FlaxBigBirdLayerCollection(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        if self.gradient_checkpointing:
+            FlaxBigBirdCheckpointLayer = remat(FlaxBigBirdLayer, static_argnums=(5, 6, 7))
+            self.layers = [
+                FlaxBigBirdCheckpointLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+        else:
+            self.layers = [
+                FlaxBigBirdLayer(self.config, layer_id=i, name=str(i), dtype=self.dtype)
+                for i in range(self.config.num_hidden_layers)
+            ]
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLayerCollection.__call__ with Bert->BigBird
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # Check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.shape[0] != (len(self.layers)):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for                  "
+                    f"       {head_mask.shape[0]}."
+                )
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = layer(
+                hidden_states,
+                attention_mask,
+                head_mask[i] if head_mask is not None else None,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                init_cache,
+                deterministic,
+                output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        outputs = (hidden_states, all_hidden_states, all_attentions, all_cross_attentions)
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertEncoder with Bert->BigBird
+class FlaxBigBirdEncoder(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.layer = FlaxBigBirdLayerCollection(
+            self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        return self.layer(
+            hidden_states,
+            attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPredictionHeadTransform with Bert->BigBird
+class FlaxBigBirdPredictionHeadTransform(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
+        self.activation = ACT2FN[self.config.hidden_act]
+        self.LayerNorm = nn.LayerNorm(epsilon=self.config.layer_norm_eps, dtype=self.dtype)
+
+    def __call__(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return self.LayerNorm(hidden_states)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertLMPredictionHead with Bert->BigBird, np.ndarray->jnp.ndarray
+class FlaxBigBirdLMPredictionHead(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros
+
+    def setup(self):
+        self.transform = FlaxBigBirdPredictionHeadTransform(self.config, dtype=self.dtype)
+        self.decoder = nn.Dense(self.config.vocab_size, dtype=self.dtype, use_bias=False)
+        self.bias = self.param("bias", self.bias_init, (self.config.vocab_size,))
+
+    def __call__(self, hidden_states, shared_embedding=None):
+        hidden_states = self.transform(hidden_states)
+
+        if shared_embedding is not None:
+            hidden_states = self.decoder.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+        else:
+            hidden_states = self.decoder(hidden_states)
+
+        bias = jnp.asarray(self.bias, self.dtype)
+        hidden_states += bias
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertOnlyMLMHead with Bert->BigBird
+class FlaxBigBirdOnlyMLMHead(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype)
+
+    def __call__(self, hidden_states, shared_embedding=None):
+        hidden_states = self.predictions(hidden_states, shared_embedding=shared_embedding)
+        return hidden_states
+
+
+class FlaxBigBirdPreTrainingHeads(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.predictions = FlaxBigBirdLMPredictionHead(self.config, dtype=self.dtype)
+        self.seq_relationship = nn.Dense(2, dtype=self.dtype)
+
+    def __call__(self, hidden_states, pooled_output, shared_embedding=None):
+        prediction_scores = self.predictions(hidden_states, shared_embedding=shared_embedding)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+class FlaxBigBirdPreTrainedModel(FlaxPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = BigBirdConfig
+    base_model_prefix = "bert"
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: BigBirdConfig,
+        input_shape: Optional[tuple] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        gradient_checkpointing: bool = False,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, gradient_checkpointing=gradient_checkpointing, **kwargs)
+        if config.attention_type == "block_sparse" and input_shape is None:
+            input_shape = (1, 12 * config.block_size)
+        elif input_shape is None:
+            input_shape = (1, 1)
+
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    # Copied from transformers.models.bert.modeling_flax_bert.FlaxBertPreTrainedModel.enable_gradient_checkpointing
+    def enable_gradient_checkpointing(self):
+        self._module = self.module_class(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=True,
+        )
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        token_type_ids = jnp.zeros_like(input_ids)
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_shape)
+        attention_mask = jnp.ones_like(input_ids)
+        head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+
+        params_rng, dropout_rng, indices_rng = jax.random.split(rng, num=3)
+        rngs = {"params": params_rng, "dropout": dropout_rng, "indices": indices_rng}
+
+        if self.config.add_cross_attention:
+            encoder_hidden_states = jnp.zeros(input_shape + (self.config.hidden_size,))
+            encoder_attention_mask = attention_mask
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                token_type_ids,
+                position_ids,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                return_dict=False,
+            )
+        else:
+            module_init_outputs = self.module.init(
+                rngs,
+                input_ids,
+                attention_mask,
+                token_type_ids,
+                position_ids,
+                head_mask,
+                return_dict=False,
+            )
+
+        random_params = module_init_outputs["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    # Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderPreTrainedModel.init_cache
+    def init_cache(self, batch_size, max_length):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+        """
+        # init input variables to retrieve cache
+        input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        attention_mask = jnp.ones_like(input_ids, dtype="i4")
+        position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0), input_ids, attention_mask, position_ids, return_dict=False, init_cache=True
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        params: Optional[dict] = None,
+        dropout_rng: Optional[jax.random.PRNGKey] = None,
+        indices_rng: Optional[jax.random.PRNGKey] = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        past_key_values: Optional[dict] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # init input tensors if not passed
+        if token_type_ids is None:
+            token_type_ids = jnp.zeros_like(input_ids)
+
+        if position_ids is None:
+            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+
+        if head_mask is None:
+            head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if indices_rng is not None:
+            rngs["indices"] = indices_rng
+
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        if self.config.add_cross_attention:
+            # if past_key_values are passed then cache is already initialized a private flag init_cache has to be passed
+            # down to ensure cache is used. It has to be made sure that cache is marked as mutable so that it can be
+            # changed by FlaxBigBirdAttention module
+            if past_key_values:
+                inputs["cache"] = past_key_values
+                mutable = ["cache"]
+            else:
+                mutable = False
+
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+                mutable=mutable,
+            )
+
+            # add updated cache to model output
+            if past_key_values is not None and return_dict:
+                outputs, past_key_values = outputs
+                outputs["past_key_values"] = unfreeze(past_key_values["cache"])
+                return outputs
+            elif past_key_values is not None and not return_dict:
+                outputs, past_key_values = outputs
+                outputs = outputs[:1] + (unfreeze(past_key_values["cache"]),) + outputs[1:]
+
+        else:
+            outputs = self.module.apply(
+                inputs,
+                jnp.array(input_ids, dtype="i4"),
+                jnp.array(attention_mask, dtype="i4"),
+                token_type_ids=jnp.array(token_type_ids, dtype="i4"),
+                position_ids=jnp.array(position_ids, dtype="i4"),
+                head_mask=jnp.array(head_mask, dtype="i4"),
+                deterministic=not train,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                rngs=rngs,
+            )
+
+        return outputs
+
+
+class FlaxBigBirdModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    add_pooling_layer: bool = True
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.embeddings = FlaxBigBirdEmbeddings(self.config, dtype=self.dtype)
+        self.encoder = FlaxBigBirdEncoder(
+            self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.pooler = nn.Dense(
+            self.config.hidden_size,
+            kernel_init=jax.nn.initializers.normal(self.config.initializer_range),
+            dtype=self.dtype,
+        )
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        hidden_states = self.embeddings(
+            input_ids, token_type_ids, position_ids, attention_mask, deterministic=deterministic
+        )
+        outputs = self.encoder(
+            hidden_states,
+            attention_mask,
+            head_mask=head_mask,
+            deterministic=deterministic,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = outputs[0]
+
+        pooled = nn.tanh(self.pooler(hidden_states[:, 0, :])) if self.add_pooling_layer else None
+
+        if not return_dict:
+            # if pooled is None, don't return it
+            if pooled is None:
+                return (hidden_states,) + outputs[1:]
+            return (hidden_states, pooled) + outputs[1:]
+
+        return FlaxBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            pooler_output=pooled,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The bare BigBird Model transformer outputting raw hidden-states without any specific head on top.",
+    BIG_BIRD_START_DOCSTRING,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertModel with Bert->BigBird
+class FlaxBigBirdModel(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdModule
+
+
+append_call_sample_docstring(FlaxBigBirdModel, _CHECKPOINT_FOR_DOC, FlaxBaseModelOutputWithPooling, _CONFIG_FOR_DOC)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTrainingModule with Bert->BigBird
+class FlaxBigBirdForPreTrainingModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.bert = FlaxBigBirdModule(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.cls = FlaxBigBirdPreTrainingHeads(config=self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.bert(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+        else:
+            shared_embedding = None
+
+        hidden_states = outputs[0]
+        pooled_output = outputs[1]
+
+        prediction_scores, seq_relationship_score = self.cls(
+            hidden_states, pooled_output, shared_embedding=shared_embedding
+        )
+
+        if not return_dict:
+            return (prediction_scores, seq_relationship_score) + outputs[2:]
+
+        return FlaxBigBirdForPreTrainingOutput(
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    BigBird Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+    sentence prediction (classification)` head.
+    """,
+    BIG_BIRD_START_DOCSTRING,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForPreTraining with Bert->BigBird
+class FlaxBigBirdForPreTraining(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdForPreTrainingModule
+
+
+FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING = """
+    Returns:
+
+    Example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, FlaxBigBirdForPreTraining
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/bigbird-roberta-base")
+    >>> model = FlaxBigBirdForPreTraining.from_pretrained("google/bigbird-roberta-base")
+
+    >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="np")
+    >>> outputs = model(**inputs)
+
+    >>> prediction_logits = outputs.prediction_logits
+    >>> seq_relationship_logits = outputs.seq_relationship_logits
+    ```
+"""
+
+overwrite_call_docstring(
+    FlaxBigBirdForPreTraining,
+    BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length") + FLAX_BIG_BIRD_FOR_PRETRAINING_DOCSTRING,
+)
+append_replace_return_docstrings(
+    FlaxBigBirdForPreTraining, output_type=FlaxBigBirdForPreTrainingOutput, config_class=_CONFIG_FOR_DOC
+)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLMModule with Bert->BigBird
+class FlaxBigBirdForMaskedLMModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.bert = FlaxBigBirdModule(
+            config=self.config,
+            add_pooling_layer=False,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.bert(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+        else:
+            shared_embedding = None
+
+        # Compute the prediction scores
+        logits = self.cls(hidden_states, shared_embedding=shared_embedding)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]
+
+        return FlaxMaskedLMOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings("""BigBird Model with a `language modeling` head on top.""", BIG_BIRD_START_DOCSTRING)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMaskedLM with Bert->BigBird
+class FlaxBigBirdForMaskedLM(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdForMaskedLMModule
+
+
+append_call_sample_docstring(FlaxBigBirdForMaskedLM, _CHECKPOINT_FOR_DOC, FlaxMaskedLMOutput, _CONFIG_FOR_DOC)
+
+
+class FlaxBigBirdClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.dense = nn.Dense(self.config.hidden_size, dtype=self.dtype)
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.out_proj = nn.Dense(self.config.num_labels, dtype=self.dtype)
+
+    def __call__(self, features, deterministic=True):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x, deterministic=deterministic)
+        x = self.dense(x)
+        x = ACT2FN[self.config.hidden_act](x)
+        x = self.dropout(x, deterministic=deterministic)
+        x = self.out_proj(x)
+        return x
+
+
+class FlaxBigBirdForSequenceClassificationModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.bert = FlaxBigBirdModule(
+            config=self.config, dtype=self.dtype, gradient_checkpointing=self.gradient_checkpointing
+        )
+        self.classifier = FlaxBigBirdClassificationHead(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.bert(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output, deterministic=deterministic)
+
+        if not return_dict:
+            return (logits,) + outputs[2:]
+
+        return FlaxSequenceClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    BigBird Model transformer with a sequence classification/regression head on top (a linear layer on top of the
+    pooled output) e.g. for GLUE tasks.
+    """,
+    BIG_BIRD_START_DOCSTRING,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForSequenceClassification with Bert->BigBird
+class FlaxBigBirdForSequenceClassification(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdForSequenceClassificationModule
+
+
+append_call_sample_docstring(
+    FlaxBigBirdForSequenceClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxSequenceClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForMultipleChoiceModule with Bert->BigBird
+class FlaxBigBirdForMultipleChoiceModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.bert = FlaxBigBirdModule(
+            config=self.config,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        self.classifier = nn.Dense(1, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        num_choices = input_ids.shape[1]
+        input_ids = input_ids.reshape(-1, input_ids.shape[-1]) if input_ids is not None else None
+        attention_mask = attention_mask.reshape(-1, attention_mask.shape[-1]) if attention_mask is not None else None
+        token_type_ids = token_type_ids.reshape(-1, token_type_ids.shape[-1]) if token_type_ids is not None else None
+        position_ids = position_ids.reshape(-1, position_ids.shape[-1]) if position_ids is not None else None
+
+        # Model
+        outputs = self.bert(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+        pooled_output = self.dropout(pooled_output, deterministic=deterministic)
+        logits = self.classifier(pooled_output)
+
+        reshaped_logits = logits.reshape(-1, num_choices)
+
+        if not return_dict:
+            return (reshaped_logits,) + outputs[2:]
+
+        return FlaxMultipleChoiceModelOutput(
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    BigBird Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    BIG_BIRD_START_DOCSTRING,
+)
+class FlaxBigBirdForMultipleChoice(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdForMultipleChoiceModule
+
+    def __init__(
+        self,
+        config: BigBirdConfig,
+        input_shape: Optional[tuple] = None,
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        if config.attention_type == "block_sparse" and input_shape is None:
+            input_shape = (1, 1, 12 * config.block_size)
+        elif input_shape is None:
+            input_shape = (1, 1)
+        super().__init__(config, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+
+overwrite_call_docstring(
+    FlaxBigBirdForMultipleChoice, BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")
+)
+append_call_sample_docstring(
+    FlaxBigBirdForMultipleChoice,
+    _CHECKPOINT_FOR_DOC,
+    FlaxMultipleChoiceModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassificationModule with Bert->BigBird
+class FlaxBigBirdForTokenClassificationModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.bert = FlaxBigBirdModule(
+            config=self.config,
+            dtype=self.dtype,
+            add_pooling_layer=False,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        classifier_dropout = (
+            self.config.classifier_dropout
+            if self.config.classifier_dropout is not None
+            else self.config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(rate=classifier_dropout)
+        self.classifier = nn.Dense(self.config.num_labels, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.bert(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states, deterministic=deterministic)
+        logits = self.classifier(hidden_states)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]
+
+        return FlaxTokenClassifierOutput(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    BigBird Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    BIG_BIRD_START_DOCSTRING,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForTokenClassification with Bert->BigBird
+class FlaxBigBirdForTokenClassification(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdForTokenClassificationModule
+
+
+append_call_sample_docstring(
+    FlaxBigBirdForTokenClassification,
+    _CHECKPOINT_FOR_DOC,
+    FlaxTokenClassifierOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxBigBirdForQuestionAnsweringHead(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self):
+        self.dropout = nn.Dropout(rate=self.config.hidden_dropout_prob)
+        self.intermediate = FlaxBigBirdIntermediate(self.config, dtype=self.dtype)
+        self.output = FlaxBigBirdOutput(self.config, dtype=self.dtype)
+        self.qa_outputs = nn.Dense(self.config.num_labels, dtype=self.dtype)
+
+    def __call__(self, encoder_output, deterministic=True):
+        hidden_states = self.dropout(encoder_output, deterministic=deterministic)
+        hidden_states = self.intermediate(hidden_states)
+        hidden_states = self.output(hidden_states, encoder_output)
+        hidden_states = self.qa_outputs(hidden_states)
+        return hidden_states
+
+
+class FlaxBigBirdForQuestionAnsweringModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    add_pooling_layer: bool = False
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.config.num_labels = 2
+        self.bert = FlaxBigBirdModule(
+            self.config,
+            dtype=self.dtype,
+            add_pooling_layer=self.add_pooling_layer,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.qa_classifier = FlaxBigBirdForQuestionAnsweringHead(self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        token_type_ids,
+        position_ids,
+        head_mask,
+        logits_mask=None,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.bert(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            position_ids,
+            head_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        pooled_output = outputs[1] if self.add_pooling_layer else None
+        logits = self.qa_classifier(hidden_states, deterministic=deterministic)
+
+        if logits_mask is not None:
+            # removing question tokens from the competition
+            logits = logits - logits_mask * 1e6
+
+        start_logits, end_logits = jnp.split(logits, self.config.num_labels, axis=-1)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = end_logits.squeeze(-1)
+
+        if not return_dict:
+            return (start_logits, end_logits) + outputs[1:]
+
+        return FlaxBigBirdForQuestionAnsweringModelOutput(
+            start_logits=start_logits,
+            end_logits=end_logits,
+            pooled_output=pooled_output,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    BigBird Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    BIG_BIRD_START_DOCSTRING,
+)
+class FlaxBigBirdForQuestionAnswering(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdForQuestionAnsweringModule
+
+    @add_start_docstrings_to_model_forward(BIG_BIRD_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    def __call__(
+        self,
+        input_ids,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        question_lengths=None,
+        params: Optional[dict] = None,
+        dropout_rng: Optional[jax.random.PRNGKey] = None,
+        indices_rng: Optional[jax.random.PRNGKey] = None,
+        train: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if position_ids is None:
+            position_ids = jnp.broadcast_to(jnp.arange(jnp.atleast_2d(input_ids).shape[-1]), input_ids.shape)
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+
+        if head_mask is None:
+            head_mask = jnp.ones((self.config.num_hidden_layers, self.config.num_attention_heads))
+
+        if question_lengths is None and input_ids is not None:
+            # assuming input_ids format: <cls> <question> <sep> context <sep>
+            question_lengths = jnp.argmax((input_ids == self.config.sep_token_id).astype("i4"), axis=-1) + 1
+            question_lengths = jnp.expand_dims(question_lengths, axis=1)
+
+        seqlen = input_ids.shape[1]
+
+        logits_mask = None
+        if question_lengths is not None:
+            # setting lengths logits to `-inf`
+            logits_mask = self.prepare_question_mask(question_lengths, seqlen)
+            if token_type_ids is None:
+                token_type_ids = (~logits_mask).astype("i4")
+            logits_mask = jnp.expand_dims(logits_mask, axis=2)
+            logits_mask = logits_mask.at[:, 0].set(False)
+
+        # init input tensors if not passed
+        if token_type_ids is None:
+            token_type_ids = jnp.zeros_like(input_ids)
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        if indices_rng is not None:
+            rngs["indices"] = indices_rng
+
+        return self.module.apply(
+            {"params": params or self.params},
+            jnp.array(input_ids, dtype="i4"),
+            jnp.array(attention_mask, dtype="i4"),
+            token_type_ids,
+            jnp.array(position_ids, dtype="i4"),
+            jnp.array(head_mask, dtype="i4"),
+            logits_mask,
+            not train,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            rngs=rngs,
+        )
+
+    @staticmethod
+    def prepare_question_mask(q_lengths, maxlen: int):
+        # q_lengths -> (bz, 1)
+        mask = jnp.arange(0, maxlen)
+        mask = jnp.expand_dims(mask, axis=0) < q_lengths
+        return mask
+
+
+append_call_sample_docstring(
+    FlaxBigBirdForQuestionAnswering,
+    _CHECKPOINT_FOR_DOC,
+    FlaxBigBirdForQuestionAnsweringModelOutput,
+    _CONFIG_FOR_DOC,
+)
+
+
+class FlaxBigBirdForCausalLMModule(nn.Module):
+    config: BigBirdConfig
+    dtype: jnp.dtype = jnp.float32
+    gradient_checkpointing: bool = False
+
+    def setup(self):
+        self.bert = FlaxBigBirdModule(
+            config=self.config,
+            add_pooling_layer=False,
+            dtype=self.dtype,
+            gradient_checkpointing=self.gradient_checkpointing,
+        )
+        self.cls = FlaxBigBirdOnlyMLMHead(config=self.config, dtype=self.dtype)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        token_type_ids: Optional[jnp.ndarray] = None,
+        head_mask: Optional[jnp.ndarray] = None,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # Model
+        outputs = self.bert(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            init_cache=init_cache,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.bert.variables["params"]["embeddings"]["word_embeddings"]["embedding"]
+        else:
+            shared_embedding = None
+
+        # Compute the prediction scores
+        logits = self.cls(hidden_states, shared_embedding=shared_embedding)
+
+        if not return_dict:
+            return (logits,) + outputs[1:]
+
+        return FlaxCausalLMOutputWithCrossAttentions(
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    BigBird Model with a language modeling head on top (a linear layer on top of the hidden-states output) e.g for
+    autoregressive tasks.
+    """,
+    BIG_BIRD_START_DOCSTRING,
+)
+# Copied from transformers.models.bert.modeling_flax_bert.FlaxBertForCausalLM with Bert->BigBird
+class FlaxBigBirdForCausalLM(FlaxBigBirdPreTrainedModel):
+    module_class = FlaxBigBirdForCausalLMModule
+
+    def prepare_inputs_for_generation(self, input_ids, max_length, attention_mask: Optional[jax.Array] = None):
+        # initializing the cache
+        batch_size, seq_length = input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyway.
+        # Thus, we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if attention_mask is not None:
+            position_ids = attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "attention_mask": extended_attention_mask,
+            "position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["position_ids"] = model_kwargs["position_ids"][:, -1:] + 1
+        return model_kwargs
+
+
+append_call_sample_docstring(
+    FlaxBigBirdForCausalLM,
+    _CHECKPOINT_FOR_DOC,
+    FlaxCausalLMOutputWithCrossAttentions,
+    _CONFIG_FOR_DOC,
+)
+
+
+__all__ = [
+    "FlaxBigBirdForCausalLM",
+    "FlaxBigBirdForMaskedLM",
+    "FlaxBigBirdForMultipleChoice",
+    "FlaxBigBirdForPreTraining",
+    "FlaxBigBirdForQuestionAnswering",
+    "FlaxBigBirdForSequenceClassification",
+    "FlaxBigBirdForTokenClassification",
+    "FlaxBigBirdModel",
+    "FlaxBigBirdPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py
new file mode 100644
index 0000000000000000000000000000000000000000..56680972a63e6360eaaefc023e57bd6beff71b9d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird.py
@@ -0,0 +1,303 @@
+# coding=utf-8
+# Copyright 2021 Google Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for BigBird."""
+
+import os
+import re
+from shutil import copyfile
+from typing import Any, Optional
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+
+
+@requires(backends=("sentencepiece",))
+class BigBirdTokenizer(PreTrainedTokenizer):
+    """
+    Construct a BigBird tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The begin of sequence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    prefix_tokens: list[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        sep_token="[SEP]",
+        mask_token="[MASK]",
+        cls_token="[CLS]",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        self.vocab_file = vocab_file
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            sep_token=sep_token,
+            mask_token=mask_token,
+            cls_token=cls_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return self.sp_model.get_piece_size()
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.sp_model.piece_to_id(token)
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = self.sp_model.IdToPiece(index)
+        return token
+
+    # Copied from transformers.models.albert.tokenization_albert.AlbertTokenizer.convert_tokens_to_string
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        prev_is_special = False
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                if not prev_is_special:
+                    out_string += " "
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                prev_is_special = True
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+                prev_is_special = False
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string.strip()
+
+    def _decode(
+        self,
+        token_ids: list[int],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        spaces_between_special_tokens: bool = True,
+        **kwargs,
+    ) -> str:
+        self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)
+
+        filtered_tokens = self.convert_ids_to_tokens(token_ids, skip_special_tokens=skip_special_tokens)
+
+        # To avoid mixing byte-level and unicode for byte-level BPT
+        # we need to build string separately for added tokens and byte-level tokens
+        # cf. https://github.com/huggingface/transformers/issues/1133
+        sub_texts = []
+        current_sub_text = []
+        for token in filtered_tokens:
+            if skip_special_tokens and token in self.all_special_ids:
+                continue
+            if token in self.added_tokens_encoder:
+                if current_sub_text:
+                    sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+                    current_sub_text = []
+                sub_texts.append(token)
+            else:
+                current_sub_text.append(token)
+        if current_sub_text:
+            sub_texts.append(self.convert_tokens_to_string(current_sub_text))
+
+        # Mimic the behavior of the Rust tokenizer:
+        # No space before [MASK] and [SEP]
+        if spaces_between_special_tokens:
+            text = re.sub(r" (\[(MASK|SEP)\])", r"\1", " ".join(sub_texts))
+        else:
+            text = "".join(sub_texts)
+
+        clean_up_tokenization_spaces = (
+            clean_up_tokenization_spaces
+            if clean_up_tokenization_spaces is not None
+            else self.clean_up_tokenization_spaces
+        )
+        if clean_up_tokenization_spaces:
+            clean_text = self.clean_up_tokenization(text)
+            return clean_text
+        else:
+            return text
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A Big Bird sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+
+__all__ = ["BigBirdTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..6148585a40b10385098c69714f650e1f42c22b4f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/big_bird/tokenization_big_bird_fast.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and the HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Big Bird model."""
+
+import os
+from shutil import copyfile
+from typing import Optional
+
+from ...tokenization_utils import AddedToken
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import is_sentencepiece_available, logging
+
+
+if is_sentencepiece_available():
+    from .tokenization_big_bird import BigBirdTokenizer
+else:
+    BigBirdTokenizer = None
+
+logger = logging.get_logger(__name__)
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+
+SPIECE_UNDERLINE = "▁"
+
+
+class BigBirdTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" BigBird tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models). This
+    tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token. .. note:: When building a sequence using special tokens, this is not the token
+            that is used for the end of sequence. The token used is the `sep_token`.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = BigBirdTokenizer
+    model_input_names = ["input_ids", "attention_mask"]
+    prefix_tokens: list[int] = []
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        unk_token="<unk>",
+        bos_token="<s>",
+        eos_token="</s>",
+        pad_token="<pad>",
+        sep_token="[SEP]",
+        mask_token="[MASK]",
+        cls_token="[CLS]",
+        **kwargs,
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An BigBird sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+        if token_ids_1 is None:
+            return cls + token_ids_0 + sep
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of ids.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Set to True if the token list is already formatted with special tokens for the model
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0]
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["BigBirdTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c135bcf0ef11c065b4b7ca03e92060f6cadb7719
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/configuration_biogpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5410a327c09c64d10f7b6c3e01d58baeb49a83af
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modeling_biogpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5db3ac279bc72cfbd90f251935f447af77ce93d8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/modular_biogpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65f260d957a0a457a35d1ef29d0a9740682b12e1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/biogpt/__pycache__/tokenization_biogpt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b813295c4455316449134cd685ec05cefdba90b1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b897a61ace0764eecfc2945901556b8b1c9de93c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/configuration_bit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f039a45ce751e82d060b3729567e03c1738f03b5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da70e23122d642dc6fb75a5c7718e2e8573cf395
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/image_processing_bit_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79d9eeee3ae84923418ebe3d36df5c3b089eac15
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bit/__pycache__/modeling_bit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c3428dd3800b58e8052c53cbd71adb7dce8ea451
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_bitnet import *
+    from .modeling_bitnet import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..6df31b5b8200e3b30eff59a270708984a90c35ca
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/configuration_bitnet.py
@@ -0,0 +1,147 @@
+# coding=utf-8
+# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+"""BitNet model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BitNetConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BitNetModel`]. It is used to instantiate an BitNet
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of
+    BitNet b1.58 2B4T [microsoft/bitnet-b1.58-2B-4T](https://huggingface.co/microsoft/bitnet-b1.58-2B-4T).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 128256):
+            Vocabulary size of the BitNet model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`BitNetModel`]
+        hidden_size (`int`, *optional*, defaults to 2560):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 6912):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 30):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 20):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 5):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu2"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 128000):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 128001):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 500000.0):
+            The base period of the RoPE embeddings.
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+
+    ```python
+    >>> from transformers import BitNetModel, BitNetConfig
+
+    >>> # Initializing a BitNet style configuration
+    >>> configuration = BitNetConfig()
+
+    >>> # Initializing a model from the BitNet style configuration
+    >>> model = BitNetModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "bitnet"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=128256,
+        hidden_size=2560,
+        intermediate_size=6912,
+        num_hidden_layers=30,
+        num_attention_heads=20,
+        num_key_value_heads=5,
+        hidden_act="relu2",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=128000,
+        eos_token_id=128001,
+        tie_word_embeddings=False,
+        rope_theta=500000.0,
+        attention_bias=False,
+        attention_dropout=0.0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["BitNetConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..0af7c794b155fa989da445960376069885b30bf6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modeling_bitnet.py
@@ -0,0 +1,488 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/bitnet/modular_bitnet.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_bitnet.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_bitnet import BitNetConfig
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class BitNetRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        BitNetRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class BitNetMLP(nn.Module):
+    def __init__(self, config: BitNetConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+        self.ffn_sub_norm = BitNetRMSNorm(config.intermediate_size, eps=config.rms_norm_eps)
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.ffn_sub_norm(self.act_fn(self.gate_proj(x)) * self.up_proj(x)))
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class BitNetAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: BitNetConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.attn_sub_norm(attn_output)  # diff with Llama
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class BitNetDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: BitNetConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = BitNetAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = BitNetMLP(config)
+        self.input_layernorm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class BitNetRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: BitNetConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class BitNetPreTrainedModel(PreTrainedModel):
+    config: BitNetConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["BitNetDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": BitNetDecoderLayer,
+        "attentions": BitNetAttention,
+    }
+
+
+@auto_docstring
+class BitNetModel(BitNetPreTrainedModel):
+    def __init__(self, config: BitNetConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [BitNetDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = BitNetRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class BitNetForCausalLM(BitNetPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = None
+    _pp_plan = None
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = BitNetModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, BitNetForCausalLM
+
+        >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
+
+        >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: '
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=100)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?"
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["BitNetForCausalLM", "BitNetModel", "BitNetPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..92ad3a32145992eb5b8a6bbac0af97f079ef2352
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bitnet/modular_bitnet.py
@@ -0,0 +1,155 @@
+# coding=utf-8
+# Copyright 2025 The BitNet Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+"""PyTorch BitNet model."""
+
+from typing import Callable, Optional
+
+import torch
+
+from ...cache_utils import Cache
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import CausalLMOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+from ..gemma.modeling_gemma import GemmaMLP
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaModel,
+    LlamaRMSNorm,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from .configuration_bitnet import BitNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class BitNetRMSNorm(LlamaRMSNorm):
+    pass
+
+
+class BitNetMLP(GemmaMLP):
+    def __init__(self, config: BitNetConfig):
+        super().__init__(config)
+        self.ffn_sub_norm = BitNetRMSNorm(config.intermediate_size, eps=config.rms_norm_eps)
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.ffn_sub_norm(self.act_fn(self.gate_proj(x)) * self.up_proj(x)))
+        return down_proj
+
+
+class BitNetAttention(LlamaAttention):
+    def __init__(self, config: BitNetConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.attn_sub_norm = BitNetRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.attn_sub_norm(attn_output)  # diff with Llama
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class BitNetDecoderLayer(LlamaDecoderLayer):
+    pass
+
+
+class BitNetModel(LlamaModel):
+    pass
+
+
+class BitNetForCausalLM(LlamaForCausalLM):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = None
+    _pp_plan = None
+
+    def forward(
+        self,
+        **super_kwargs,
+    ) -> CausalLMOutputWithPast:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, transformers.,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, transformers., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, BitNetForCausalLM
+
+        >>> model = BitNetForCausalLM.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
+        >>> tokenizer = AutoTokenizer.from_pretrained("microsoft/bitnet-b1.58-2B-4T")
+
+        >>> prompt = f'<|begin_of_text|>User: Hey, are you conscious? Can you talk to me?<|eot_id|>Assistant: '
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=100)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "User: Hey, are you conscious? Can you talk to me?Assistant: No, I'm not conscious. I'm an artificial intelligence designed to assist with information and tasks. How can I help you today?"
+        ```"""
+        return super().forward(**super_kwargs)
+
+
+__all__ = [
+    "BitNetForCausalLM",
+    "BitNetModel",
+    "BitNetPreTrainedModel",  # noqa: F822
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e4da453659b949fedd47b36565f445d02a52ae46
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/configuration_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/configuration_blenderbot_small.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b142562ba9991f137031e7db5e633eaf70ae4ef5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/configuration_blenderbot_small.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_blenderbot_small.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6af91616debfa513303c655a72a94d9781bd8fc9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_blenderbot_small.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_flax_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_flax_blenderbot_small.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3c85e63984c60b4cd4e1860691051cefcfb129e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_flax_blenderbot_small.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_tf_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_tf_blenderbot_small.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eeca7219efa297ce075e5f2283776dbaade13519
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/modeling_tf_blenderbot_small.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1778d3be458baf23f9f45300df71644075d5f7bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d1b57dd123cfc3132bad6042296daf8d2250598c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blenderbot_small/__pycache__/tokenization_blenderbot_small_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..952de2f855a71591e950b31751af1fe6cfbae20a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_blip import *
+    from .image_processing_blip import *
+    from .image_processing_blip_fast import *
+    from .modeling_blip import *
+    from .modeling_blip_text import *
+    from .modeling_tf_blip import *
+    from .modeling_tf_blip_text import *
+    from .processing_blip import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e0a5590c3f769896675e801f76ecfa9234eb8be
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/configuration_blip.py
@@ -0,0 +1,317 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Blip model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class BlipTextConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BlipTextModel`]. It is used to instantiate a BLIP
+    text model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the `BlipText` used by the [base
+    architectures](https://huggingface.co/Salesforce/blip-vqa-base).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30524):
+            Vocabulary size of the `Blip` text model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`BlipModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        encoder_hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers from the vision model.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        bos_token_id (`int`, *optional*, defaults to 30522):
+            The id of the `beginning-of-sequence` token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the `end-of-sequence` token.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the `padding` token.
+        sep_token_id (`int`, *optional*, defaults to 102):
+            The id of the `separator` token.
+        is_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as a decoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        label_smoothing (float, *optional*):
+            A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+            become a mixture of the original ground truth and a uniform distribution as described in
+            `Rethinking the Inception Architecture for Computer Vision <https://huggingface.co/papers/1512.00567>`__. Default: :math:`0.0`.
+
+    Example:
+
+    ```python
+    >>> from transformers import BlipTextConfig, BlipTextModel
+
+    >>> # Initializing a BlipTextConfig with Salesforce/blip-vqa-base style configuration
+    >>> configuration = BlipTextConfig()
+
+    >>> # Initializing a BlipTextModel (with random weights) from the Salesforce/blip-vqa-base style configuration
+    >>> model = BlipTextModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "blip_text_model"
+    base_config_key = "text_config"
+
+    def __init__(
+        self,
+        vocab_size=30524,
+        hidden_size=768,
+        encoder_hidden_size=768,
+        intermediate_size=3072,
+        projection_dim=768,
+        num_hidden_layers=12,
+        num_attention_heads=8,
+        max_position_embeddings=512,
+        hidden_act="gelu",
+        layer_norm_eps=1e-12,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        bos_token_id=30522,
+        eos_token_id=2,
+        pad_token_id=0,
+        sep_token_id=102,
+        is_decoder=True,
+        use_cache=True,
+        label_smoothing=0.0,
+        **kwargs,
+    ):
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            sep_token_id=sep_token_id,
+            **kwargs,
+        )
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.encoder_hidden_size = encoder_hidden_size
+        self.intermediate_size = intermediate_size
+        self.projection_dim = projection_dim
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.is_decoder = is_decoder
+        self.use_cache = use_cache
+        self.label_smoothing = label_smoothing
+
+
+class BlipVisionConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`BlipVisionModel`]. It is used to instantiate a
+    BLIP vision model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration defaults will yield a similar configuration to that of the Blip-base
+    [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        image_size (`int`, *optional*, defaults to 384):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` `"gelu"` are supported.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 1e-10):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import BlipVisionConfig, BlipVisionModel
+
+    >>> # Initializing a BlipVisionConfig with Salesforce/blip-vqa-base style configuration
+    >>> configuration = BlipVisionConfig()
+
+    >>> # Initializing a BlipVisionModel (with random weights) from the Salesforce/blip-vqa-base style configuration
+    >>> model = BlipVisionModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "blip_vision_model"
+    base_config_key = "vision_config"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        intermediate_size=3072,
+        projection_dim=512,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        image_size=384,
+        patch_size=16,
+        hidden_act="gelu",
+        layer_norm_eps=1e-5,
+        attention_dropout=0.0,
+        initializer_range=1e-10,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.projection_dim = projection_dim
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.patch_size = patch_size
+        self.image_size = image_size
+        self.initializer_range = initializer_range
+        self.attention_dropout = attention_dropout
+        self.layer_norm_eps = layer_norm_eps
+        self.hidden_act = hidden_act
+
+
+class BlipConfig(PretrainedConfig):
+    r"""
+    [`BlipConfig`] is the configuration class to store the configuration of a [`BlipModel`]. It is used to instantiate
+    a BLIP model according to the specified arguments, defining the text model and vision model configs. Instantiating
+    a configuration with the defaults will yield a similar configuration to that of the BLIP-base
+    [Salesforce/blip-vqa-base](https://huggingface.co/Salesforce/blip-vqa-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`BlipTextConfig`].
+        vision_config (`dict`, *optional*):
+            Dictionary of configuration options used to initialize [`BlipVisionConfig`].
+        projection_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of text and vision projection layers.
+        logit_scale_init_value (`float`, *optional*, defaults to 2.6592):
+            The initial value of the *logit_scale* parameter. Default is used as per the original BLIP implementation.
+        image_text_hidden_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the hidden state of the image-text fusion layer.
+        label_smoothing (float, optional, *optional*, defaults to 0.0):
+            A float in [0.0, 1.0]. Specifies the amount of smoothing when computing the loss, where 0.0 means no smoothing. The targets
+            become a mixture of the original ground truth and a uniform distribution as described in
+            `Rethinking the Inception Architecture for Computer Vision <https://huggingface.co/papers/1512.00567>`__. Default: :math:`0.0`.
+        kwargs (*optional*):
+            Dictionary of keyword arguments.
+
+    Example:
+
+    ```python
+    >>> from transformers import BlipConfig, BlipModel
+
+    >>> # Initializing a BlipConfig with Salesforce/blip-vqa-base style configuration
+    >>> configuration = BlipConfig()
+
+    >>> # Initializing a BlipPModel (with random weights) from the Salesforce/blip-vqa-base style configuration
+    >>> model = BlipModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+
+    >>> # We can also initialize a BlipConfig from a BlipTextConfig and a BlipVisionConfig
+
+    >>> # Initializing a BLIPText and BLIPVision configuration
+    >>> config_text = BlipTextConfig()
+    >>> config_vision = BlipVisionConfig()
+
+    >>> config = BlipConfig.from_text_vision_configs(config_text, config_vision)
+    ```"""
+
+    model_type = "blip"
+    sub_configs = {"text_config": BlipTextConfig, "vision_config": BlipVisionConfig}
+
+    def __init__(
+        self,
+        text_config=None,
+        vision_config=None,
+        projection_dim=512,
+        logit_scale_init_value=2.6592,
+        image_text_hidden_size=256,
+        label_smoothing=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `BlipTextConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `BlipVisionConfig` with default values.")
+
+        self.text_config = BlipTextConfig(**text_config)
+        self.vision_config = BlipVisionConfig(**vision_config)
+
+        self.text_config.encoder_hidden_size = self.vision_config.hidden_size
+
+        self.projection_dim = projection_dim
+        self.logit_scale_init_value = logit_scale_init_value
+        self.initializer_factor = 1.0
+        self.initializer_range = 0.02
+        self.image_text_hidden_size = image_text_hidden_size
+        self.label_smoothing = label_smoothing
+
+
+__all__ = ["BlipConfig", "BlipTextConfig", "BlipVisionConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py
new file mode 100644
index 0000000000000000000000000000000000000000..78a152374fd0089846f4431c8b408da4c5c9ff54
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip.py
@@ -0,0 +1,297 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for BLIP."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+class BlipImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a BLIP image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
+            overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 384, "width": 384}
+        size = get_size_dict(size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize with PILImageResampling.BILINEAR->PILImageResampling.BICUBIC
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Controls the size of the image after `resize`. The shortest edge of the image is resized to
+                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
+                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
+                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to normalize the image by if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+        return encoded_outputs
+
+
+__all__ = ["BlipImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b1d14f139049162590bcaabf923a08b194aaae5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/image_processing_blip_fast.py
@@ -0,0 +1,36 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for BLIP."""
+
+from ...image_processing_utils_fast import BaseImageProcessorFast
+from ...image_utils import OPENAI_CLIP_MEAN, OPENAI_CLIP_STD, PILImageResampling
+from ...utils import auto_docstring
+
+
+@auto_docstring
+class BlipImageProcessorFast(BaseImageProcessorFast):
+    # To be checked against the slow image processor
+    # None values left after checking can be removed
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"height": 384, "width": 384}
+    do_resize = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+
+
+__all__ = ["BlipImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py
new file mode 100644
index 0000000000000000000000000000000000000000..8a58add38af3a68befc369376f4023b461c512b4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip.py
@@ -0,0 +1,1308 @@
+# coding=utf-8
+# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch BLIP model."""
+
+import warnings
+from dataclasses import dataclass
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn.functional import normalize
+
+from ...activations import ACT2FN
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import ModelOutput, TransformersKwargs, auto_docstring, can_return_tuple, logging, torch_int
+from ...utils.generic import check_model_inputs
+from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
+from .modeling_blip_text import BlipTextLMHeadModel, BlipTextModel
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.clip.modeling_clip.contrastive_loss
+def contrastive_loss(logits: torch.Tensor) -> torch.Tensor:
+    return nn.functional.cross_entropy(logits, torch.arange(len(logits), device=logits.device))
+
+
+# Copied from transformers.models.clip.modeling_clip.clip_loss with clip->blip
+def blip_loss(similarity: torch.Tensor) -> torch.Tensor:
+    caption_loss = contrastive_loss(similarity)
+    image_loss = contrastive_loss(similarity.t())
+    return (caption_loss + image_loss) / 2.0
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
+    last hidden states. This class also adds the loss term from the text decoder.
+    """
+)
+class BlipForConditionalGenerationModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor`, *optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+        Language modeling loss from the text decoder.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
+        Prediction scores of the language modeling head of the text decoder model.
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)`, *optional*):
+        The image embeddings obtained after applying the Vision Transformer model to the input image.
+    hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings, if the model has an embedding layer, +
+        one for the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+        Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+    attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`.
+
+        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+        heads.
+    """
+
+    loss: Optional[tuple[torch.FloatTensor]] = None
+    logits: Optional[tuple[torch.FloatTensor]] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+    @property
+    def decoder_logits(self):
+        warnings.warn(
+            "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
+            " Please use the `logits` attribute to retrieve the final output instead.",
+            FutureWarning,
+        )
+        return self.logits
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
+    last hidden states. This class also adds the loss term from the text decoder.
+    """
+)
+class BlipTextVisionModelOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss from the text decoder.
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
+    last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
+    scores.
+    """
+)
+class BlipImageTextMatchingModelOutput(ModelOutput):
+    r"""
+    itm_score (`torch.FloatTensor`):
+        The image-text similarity scores.
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss from the text decoder.
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    vision_pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*):
+        Last layer hidden-state of the vision of the vision-only branch of the model.
+    question_embeds (`torch.FloatTensor`):
+        The question embeddings obtained by the text projection layer.
+    """
+
+    itm_score: Optional[torch.FloatTensor] = None
+    loss: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    vision_pooler_output: Optional[torch.FloatTensor] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    question_embeds: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring
+class BlipOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+        Contrastive loss for image-text similarity.
+    logits_per_image (`torch.FloatTensor` of shape `(image_batch_size, text_batch_size)`):
+        The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+        similarity scores.
+    logits_per_text (`torch.FloatTensor` of shape `(text_batch_size, image_batch_size)`):
+        The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+        similarity scores.
+    text_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim`):
+        The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
+    text_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`BlipTextModel`].
+    vision_model_output (`BaseModelOutputWithPooling`):
+        The output of the [`BlipVisionModel`].
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits_per_image: Optional[torch.FloatTensor] = None
+    logits_per_text: Optional[torch.FloatTensor] = None
+    text_embeds: Optional[torch.FloatTensor] = None
+    image_embeds: Optional[torch.FloatTensor] = None
+    text_model_output: BaseModelOutputWithPooling = None
+    vision_model_output: BaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+class BlipVisionEmbeddings(nn.Module):
+    def __init__(self, config: BlipVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(1, 1, self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=3, out_channels=self.embed_dim, kernel_size=self.patch_size, stride=self.patch_size
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+
+        self.position_embedding = nn.Parameter(torch.randn(1, self.num_positions, self.embed_dim))
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embedding.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embedding
+
+        class_pos_embed = self.position_embedding[:, :1]
+        patch_pos_embed = self.position_embedding[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1).to(target_dtype)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        if interpolate_pos_encoding:
+            position_embedding = self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            position_embedding = self.position_embedding
+        embeddings = embeddings + position_embedding[:, : embeddings.size(1), :].to(target_dtype)
+        return embeddings
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPTextEmbeddings with CLIP->Blip
+class BlipTextEmbeddings(nn.Module):
+    def __init__(self, config: BlipTextConfig):
+        super().__init__()
+        embed_dim = config.hidden_size
+
+        self.token_embedding = nn.Embedding(config.vocab_size, embed_dim)
+        self.position_embedding = nn.Embedding(config.max_position_embeddings, embed_dim)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        seq_length = input_ids.shape[-1] if input_ids is not None else inputs_embeds.shape[-2]
+        max_position_embedding = self.position_embedding.weight.shape[0]
+
+        if seq_length > max_position_embedding:
+            raise ValueError(
+                f"Sequence length must be less than max_position_embeddings (got `sequence length`: "
+                f"{seq_length} and max_position_embeddings: {max_position_embedding}"
+            )
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, :seq_length]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.token_embedding(input_ids)
+
+        position_embeddings = self.position_embedding(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+
+        return embeddings
+
+
+class BlipAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = nn.Dropout(config.attention_dropout)
+
+        self.qkv = nn.Linear(self.embed_dim, 3 * self.embed_dim)
+
+        self.projection = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = hidden_states.size()
+
+        mixed_qkv = (
+            self.qkv(hidden_states)
+            .reshape(bsz, tgt_len, 3, self.num_heads, embed_dim // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
+
+        attention_scores = attention_scores * self.scale
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_states).permute(0, 2, 1, 3)
+
+        new_context_layer_shape = context_layer.size()[:-2] + (self.embed_dim,)
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        output = self.projection(context_layer)
+
+        return output, attention_probs
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPMLP with CLIP->Blip
+class BlipMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class BlipEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: BlipConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = BlipAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = BlipMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    @auto_docstring
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.FloatTensor:
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            head_mask=attention_mask,
+            **kwargs,
+        )
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = hidden_states + residual
+
+        return hidden_states
+
+
+@auto_docstring
+class BlipPreTrainedModel(PreTrainedModel):
+    config: BlipConfig
+    base_model_prefix = "blip"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["BlipEncoderLayer", "BlipTextEmbeddings"]
+    _skip_keys_device_placement = ["past_key_values"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        factor = self.config.initializer_range
+        if isinstance(module, (nn.Conv2d, nn.Embedding, nn.Linear)):
+            module.weight.data.normal_(mean=0.0, std=factor)
+            if hasattr(module, "bias") and module.bias is not None:
+                module.bias.data.zero_()
+
+        if isinstance(module, BlipVisionEmbeddings):
+            if hasattr(self.config, "vision_config"):
+                factor = self.config.vision_config.initializer_range
+            nn.init.trunc_normal_(
+                module.position_embedding,
+                mean=0.0,
+                std=factor,
+            )
+
+            nn.init.trunc_normal_(
+                module.class_embedding,
+                mean=0.0,
+                std=factor,
+            )
+
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+
+class BlipEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`BlipEncoderLayer`].
+
+    Args:
+        config (`BlipConfig`):
+            The corresponding vision configuration for the `BlipEncoder`.
+    """
+
+    def __init__(self, config: BlipConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([BlipEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @auto_docstring
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BaseModelOutput]:
+        hidden_states = inputs_embeds
+        for encoder_layer in self.layers:
+            hidden_states = encoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+
+        return BaseModelOutput(last_hidden_state=hidden_states)
+
+
+class BlipVisionModel(BlipPreTrainedModel):
+    main_input_name = "pixel_values"
+    config: BlipVisionConfig
+    _can_record_outputs = {
+        "hidden_states": BlipEncoderLayer,
+        "attentions": BlipAttention,
+    }
+
+    def __init__(self, config: BlipVisionConfig):
+        super().__init__(config)
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = BlipVisionEmbeddings(config)
+        self.encoder = BlipEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+        self.post_init()
+
+    @check_model_inputs(tie_last_hidden_states=False)
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        encoder_outputs: BaseModelOutput = self.encoder(
+            inputs_embeds=hidden_states,
+            **kwargs,
+        )
+
+        last_hidden_state = encoder_outputs.last_hidden_state
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        pooled_output = last_hidden_state[:, 0, :]
+        pooled_output = self.post_layernorm(pooled_output)
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+
+@auto_docstring(
+    custom_intro="""
+    This model is going to be deprecated in future versions. Please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase.
+    """
+)
+class BlipModel(BlipPreTrainedModel):
+    config: BlipConfig
+
+    def __init__(self, config: BlipConfig):
+        super().__init__(config)
+
+        if not isinstance(config.text_config, BlipTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type BlipTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, BlipVisionConfig):
+            raise TypeError(
+                "config.vision_config is expected to be of type BlipVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = BlipTextModel(text_config)
+        self.vision_model = BlipVisionModel(vision_config)
+
+        self.visual_projection = nn.Linear(self.vision_embed_dim, self.projection_dim, bias=False)
+        self.text_projection = nn.Linear(self.text_embed_dim, self.projection_dim, bias=False)
+        self.logit_scale = nn.Parameter(torch.tensor(self.config.logit_scale_init_value))
+
+        logger.warning(
+            "`BlipModel` is going to be deprecated in future release, please use `BlipForConditionalGeneration`, `BlipForQuestionAnswering` or `BlipForImageTextRetrieval` depending on your usecase."
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.text_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.text_model.set_input_embeddings(value)
+
+    @auto_docstring
+    def get_text_features(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            text_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The text embeddings obtained by
+            applying the projection layer to the pooled output of [`BlipTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, BlipModel
+
+        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="pt")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+        )
+
+        pooled_output = text_outputs[1]
+        text_features = self.text_projection(pooled_output)
+
+        return text_features
+
+    @auto_docstring
+    def get_image_features(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            image_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The image embeddings obtained by
+            applying the projection layer to the pooled output of [`BlipVisionModel`].
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipModel
+
+        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> image_features = model.get_image_features(**inputs)
+        ```"""
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_features = self.visual_projection(pooled_output)
+
+        return image_features
+
+    @auto_docstring
+    def get_multimodal_features(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.FloatTensor:
+        r"""
+        Returns:
+            multimodal_features (`torch.FloatTensor` of shape `(batch_size, output_dim`): The multimodal embeddings
+            obtained by applying the image embeddings to the text encoder using the cross-attention mechanism.
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipModel
+
+        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> texts = ["a photo of a cat", "a photo of a dog"]
+        >>> inputs = processor(images=image, text=texts, padding=True, return_tensors="pt")
+
+        >>> multimodal_features = model.get_multimodal_features(**inputs)
+        ```"""
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+
+        image_embeds = vision_outputs[0]
+        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long)
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_atts,
+        )
+
+        pooled_output = text_outputs[1]  # pooled_output
+        multimodal_features = self.text_projection(pooled_output)
+
+        return multimodal_features
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        return_loss: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BlipOutput]:
+        r"""
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipModel
+
+        >>> model = BlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="pt", padding=True
+        ... )
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities
+        ```"""
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            **kwargs,
+        )
+
+        image_embeds = vision_outputs.pooler_output
+        image_embeds = self.visual_projection(image_embeds)
+
+        text_embeds = text_outputs.pooler_output
+        text_embeds = self.text_projection(text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / image_embeds.norm(p=2, dim=-1, keepdim=True)
+        text_embeds = text_embeds / text_embeds.norm(p=2, dim=-1, keepdim=True)
+
+        # cosine similarity as logits
+        logit_scale = self.logit_scale.exp().to(device=text_embeds.device)
+        image_embeds = image_embeds.to(device=text_embeds.device, dtype=text_embeds.dtype)
+        logits_per_text = torch.matmul(text_embeds, image_embeds.t()) * logit_scale
+        logits_per_image = logits_per_text.t()
+
+        loss = None
+        if return_loss:
+            loss = blip_loss(logits_per_text)
+
+        return BlipOutput(
+            loss=loss,
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
+    `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise,
+    the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption
+    from the text input. If no text input is provided, the decoder will start with the [BOS] token only.
+    """
+)
+class BlipForConditionalGeneration(BlipPreTrainedModel, GenerationMixin):
+    config: BlipConfig
+    _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: BlipConfig):
+        super().__init__(config)
+
+        self.vision_model = BlipVisionModel(config.vision_config)
+
+        self.text_decoder = BlipTextLMHeadModel(config.text_config)
+
+        self.decoder_input_ids = config.text_config.bos_token_id
+        self.decoder_pad_token_id = config.text_config.pad_token_id
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.text_decoder.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.text_decoder.set_input_embeddings(value)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BlipForConditionalGenerationModelOutput]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipForConditionalGeneration
+
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "A picture of"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        ```"""
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+
+        image_embeds = vision_outputs.last_hidden_state
+
+        outputs = self.text_decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            labels=labels,
+            reduction="mean",
+            **kwargs,
+        )
+
+        return BlipForConditionalGenerationModelOutput(
+            loss=outputs.loss,
+            logits=outputs.logits,
+            image_embeds=image_embeds,
+            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        pixel_values: torch.FloatTensor,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **generate_kwargs,
+    ) -> torch.LongTensor:
+        r"""
+        Overrides *generate* function to be able to use the model as a conditional generator
+
+        Parameters:
+            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
+                Input image to be processed
+            input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
+                The sequence used as a prompt for the generation.
+            attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipForConditionalGeneration
+
+        >>> model = BlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> outputs = model.generate(**inputs)
+        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
+        two cats sleeping on a couch
+        ```
+        """
+
+        batch_size = pixel_values.shape[0]
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+
+        image_embeds = vision_outputs[0]
+
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        if isinstance(input_ids, list):
+            input_ids = torch.LongTensor(input_ids)
+        elif input_ids is None:
+            input_ids = (
+                torch.LongTensor([[self.decoder_input_ids, self.config.text_config.eos_token_id]])
+                .repeat(batch_size, 1)
+                .to(image_embeds.device)
+            )
+
+        input_ids[:, 0] = self.config.text_config.bos_token_id
+        attention_mask = attention_mask[:, :-1] if attention_mask is not None else None
+
+        outputs = self.text_decoder.generate(
+            input_ids=input_ids[:, :-1],
+            eos_token_id=self.config.text_config.sep_token_id,
+            pad_token_id=self.config.text_config.pad_token_id,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            **generate_kwargs,
+        )
+
+        return outputs
+
+
+@auto_docstring(
+    custom_intro="""
+    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
+    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
+    with the encoding of the image, and the text decoder will output the answer to the question.
+    """
+)
+class BlipForQuestionAnswering(BlipPreTrainedModel, GenerationMixin):
+    config: BlipConfig
+    _tied_weights_keys = ["text_decoder.cls.predictions.decoder.bias"]
+
+    def __init__(self, config: BlipConfig):
+        super().__init__(config)
+
+        self.vision_model = BlipVisionModel(config.vision_config)
+
+        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)
+
+        self.text_decoder = BlipTextLMHeadModel(config.text_config)
+
+        self.decoder_pad_token_id = config.text_config.pad_token_id
+        self.decoder_start_token_id = config.text_config.bos_token_id
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def set_input_embeddings(self, value):
+        self.text_encoder.set_input_embeddings(value)
+
+    def get_input_embeddings(self):
+        # This will return shared embeddings if they are shared else specific to encoder.
+        return self.text_encoder.get_input_embeddings()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values: torch.FloatTensor,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BlipTextVisionModelOutput]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipForQuestionAnswering
+
+        >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> # training
+        >>> text = "How many cats are in the picture?"
+        >>> label = "2"
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+        >>> labels = processor(text=label, return_tensors="pt").input_ids
+
+        >>> inputs["labels"] = labels
+        >>> outputs = model(**inputs)
+        >>> loss = outputs.loss
+        >>> loss.backward()
+
+        >>> # inference
+        >>> text = "How many cats are in the picture?"
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+        >>> outputs = model.generate(**inputs)
+        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
+        2
+        ```"""
+        if labels is None and decoder_input_ids is None:
+            raise ValueError(
+                "Either `decoder_input_ids` or `labels` should be passed when calling `forward` with"
+                " `BlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you"
+                " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`"
+            )
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+
+        image_embeds = vision_outputs.last_hidden_state
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long)
+
+        question_embeds = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            **kwargs,
+        )
+
+        if labels is not None and decoder_input_ids is None:
+            # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153
+            decoder_input_ids = labels
+
+        question_embeds = question_embeds[0]
+
+        answer_output = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=question_embeds,
+            encoder_attention_mask=attention_mask,
+            labels=labels,
+            reduction="mean",
+            **kwargs,
+        )
+
+        if labels is not None:
+            decoder_loss = answer_output.loss.mean()
+        else:
+            decoder_loss = None
+
+        return BlipTextVisionModelOutput(
+            loss=decoder_loss,
+            image_embeds=image_embeds,
+            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values: torch.FloatTensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **generate_kwargs,
+    ) -> torch.LongTensor:
+        r"""
+        Overrides *generate* function to be able to use the model as a conditional generator
+
+        Parameters:
+            input_ids (*torch.LongTensor* of shape *(batch_size, sequence_length)*):
+                The sequence used as a prompt for the generation.
+            pixel_values (*torch.FloatTensor* of shape *(batch_size, num_channels, image_height, image_width)*:
+                Input image to be processed
+            attention_mask (*torch.LongTensor* of shape *(batch_size, sequence_length)*, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
+                tokens that are NOT MASKED, `0` for MASKED tokens.
+            **generate_kwargs:
+                Additional arguments passed to the *generate* function of the decoder
+
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipForQuestionAnswering
+
+        >>> model = BlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "How many cats are in the picture?"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+
+        >>> outputs = model.generate(**inputs)
+        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
+        2
+        ```
+        """
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+        )
+
+        image_embeds = vision_outputs[0]
+
+        image_attention_mask = torch.ones(image_embeds.size()[:-1], dtype=torch.long, device=image_embeds.device)
+
+        if isinstance(input_ids, list):
+            input_ids = torch.LongTensor(input_ids)
+
+        question_outputs = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=False,
+        )
+
+        question_embeds = question_outputs[0]
+
+        question_attention_mask = torch.ones(
+            question_embeds.size()[:-1], dtype=torch.long, device=question_embeds.device
+        )
+
+        bos_ids = torch.full(
+            (question_embeds.size(0), 1), fill_value=self.decoder_start_token_id, device=question_embeds.device
+        )
+
+        outputs = self.text_decoder.generate(
+            input_ids=bos_ids,
+            eos_token_id=self.config.text_config.sep_token_id,
+            pad_token_id=self.config.text_config.pad_token_id,
+            encoder_hidden_states=question_embeds,
+            encoder_attention_mask=question_attention_mask,
+            **generate_kwargs,
+        )
+
+        return outputs
+
+
+@auto_docstring(
+    custom_intro="""
+    BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
+    image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
+    the image.
+    """
+)
+class BlipForImageTextRetrieval(BlipPreTrainedModel):
+    config: BlipConfig
+
+    def __init__(self, config: BlipConfig):
+        super().__init__(config)
+
+        self.vision_model = BlipVisionModel(config.vision_config)
+
+        self.text_encoder = BlipTextModel(config.text_config, add_pooling_layer=False)
+
+        # vision projection layer
+        self.vision_proj = nn.Linear(config.vision_config.hidden_size, config.image_text_hidden_size)
+
+        # text projection layer
+        self.text_proj = nn.Linear(config.text_config.hidden_size, config.image_text_hidden_size)
+
+        # image text matching head
+        self.itm_head = nn.Linear(config.text_config.hidden_size, 2)
+
+        self.decoder_pad_token_id = (
+            config.text_config.pad_token_id
+            if not hasattr(config, "decoder_pad_token_id")
+            else config.decoder_pad_token_id
+        )
+        self.decoder_start_token_id = (
+            config.text_config.bos_token_id
+            if not hasattr(config, "decoder_start_token_id")
+            else config.decoder_start_token_id
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.text_encoder.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.text_encoder.set_input_embeddings(value)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: torch.LongTensor,
+        pixel_values: torch.FloatTensor,
+        use_itm_head: Optional[bool] = True,
+        attention_mask: Optional[torch.LongTensor] = None,
+        interpolate_pos_encoding: bool = False,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, BlipTextVisionModelOutput]:
+        r"""
+        use_itm_head (`bool`, *optional*, defaults to `True`):
+            Whether or not to use the image-text matching head.
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, BlipForImageTextRetrieval
+
+        >>> model = BlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "an image of a cat"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        ```
+        """
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            **kwargs,
+        )
+
+        image_embeds = vision_outputs.last_hidden_state
+        image_atts = torch.ones(image_embeds.size()[:-1], dtype=torch.long)
+
+        if use_itm_head:
+            question_embeds = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                encoder_hidden_states=image_embeds,
+                encoder_attention_mask=image_atts,
+                **kwargs,
+            )
+            question_embeds = question_embeds.last_hidden_state
+
+            output = self.itm_head(question_embeds[:, 0, :])
+        else:
+            question_embeds = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            question_embeds = question_embeds.last_hidden_state
+
+            image_feat = normalize(self.vision_proj(image_embeds[:, 0, :]), dim=-1)
+            text_feat = normalize(self.text_proj(question_embeds[:, 0, :]), dim=-1)
+
+            output = image_feat @ text_feat.t()
+
+        return BlipImageTextMatchingModelOutput(
+            itm_score=output,
+            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+            question_embeds=question_embeds,
+        )
+
+
+__all__ = [
+    "BlipModel",
+    "BlipPreTrainedModel",
+    "BlipForConditionalGeneration",
+    "BlipForQuestionAnswering",
+    "BlipVisionModel",
+    "BlipTextModel",
+    "BlipForImageTextRetrieval",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f1f58c7533463ab9ba01f45a15ba51a294c0651
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_blip_text.py
@@ -0,0 +1,968 @@
+# coding=utf-8
+# Copyright 2022 The Salesforce Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the BSD-3-clause license (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, device, nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_blip import BlipTextConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52
+class BlipTextEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        self.config = config
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        embeddings = inputs_embeds
+
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
+class BlipTextSelfAttention(nn.Module):
+    def __init__(self, config, is_cross_attention, layer_idx=None):
+        super().__init__()
+        self.config = config
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
+                % (config.hidden_size, config.num_attention_heads)
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.layer_idx = layer_idx
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        if is_cross_attention:
+            self.key = nn.Linear(config.encoder_hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.encoder_hidden_size, self.all_head_size)
+        else:
+            self.key = nn.Linear(config.hidden_size, self.all_head_size)
+            self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+    def save_attn_gradients(self, attn_gradients):
+        self.attn_gradients = attn_gradients
+
+    def get_attn_gradients(self):
+        return self.attn_gradients
+
+    def save_attention_map(self, attention_map):
+        self.attention_map = attention_map
+
+    def get_attention_map(self):
+        return self.attention_map
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+        attention_mask = encoder_attention_mask if is_cross_attention else attention_mask
+
+        is_updated = False
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_layer = curr_past_key_value.layers[self.layer_idx].keys
+            value_layer = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_layer = (
+                self.key(current_states)
+                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+            value_layer = (
+                self.value(current_states)
+                .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+                .transpose(1, 2)
+            )
+
+            if past_key_values is not None:
+                # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_layer, value_layer = curr_past_key_value.update(
+                    key_layer, value_layer, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = hidden_states.size()[1]
+            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function)
+            attention_scores = attention_scores + attention_mask.to(attention_scores.device)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs_dropped = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs_dropped = attention_probs_dropped * head_mask
+
+        context_layer = torch.matmul(attention_probs_dropped, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput with Bert -> BlipText
+class BlipTextSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
+class BlipTextAttention(nn.Module):
+    def __init__(self, config, is_cross_attention=False, layer_idx=None):
+        super().__init__()
+        self.self = BlipTextSelfAttention(config, is_cross_attention, layer_idx=layer_idx)
+        self.output = BlipTextSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate with Bert -> BlipText
+class BlipTextIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput with Bert -> BlipText
+class BlipTextOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class BlipTextLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_num):
+        super().__init__()
+        self.config = config
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = BlipTextAttention(config, layer_idx=layer_num)
+        self.layer_num = layer_num
+        if self.config.is_decoder:
+            self.crossattention = BlipTextAttention(
+                config, is_cross_attention=self.config.is_decoder, layer_idx=layer_num
+            )
+        self.intermediate = BlipTextIntermediate(config)
+        self.output = BlipTextOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor]:
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]
+
+        if encoder_hidden_states is not None:
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask=encoder_attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:]  # add cross attentions if we output attention weights
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        return (layer_output,) + outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
+class BlipTextEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([BlipTextLayer(config, i) for i in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache:
+            if isinstance(past_key_values, tuple):
+                logger.warning_once(
+                    "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                    "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                    "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+                )
+                past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+            # The model acts as encoder decoder but is not an encoder decoder. So we cast all cache objects to
+            # `EncoderDecoderCache` type assuming that the incoming cache is from `self_attention`
+            elif isinstance(past_key_values, DynamicCache):
+                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config))
+            elif past_key_values is None:
+                past_key_values = EncoderDecoderCache(
+                    DynamicCache(config=self.config), DynamicCache(config=self.config)
+                )
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and encoder_hidden_states is not None else None
+
+        for i in range(self.config.num_hidden_layers):
+            layer_module = self.layer[i]
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values,
+                output_attentions,
+                cache_position,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPooler with Bert->BlipText
+class BlipTextPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+# Copied from transformers.models.bert.modeling_bert.BertPredictionHeadTransform with Bert->BlipText
+class BlipTextPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert->BlipText
+class BlipTextLMPredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.transform = BlipTextPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert->BlipText
+class BlipTextOnlyMLMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = BlipTextLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
+class BlipTextPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: BlipTextConfig
+    base_model_prefix = "bert"
+    _no_split_modules = []
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571
+class BlipTextModel(BlipTextPreTrainedModel):
+    """
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an
+    `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = BlipTextEmbeddings(config)
+        self.encoder = BlipTextEncoder(config)
+        self.pooler = BlipTextPooler(config) if add_pooling_layer else None
+
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    # Copied from transformers.models.bert.modeling_bert.BertModel._prune_heads
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def get_extended_attention_mask(
+        self, attention_mask: Tensor, input_shape: tuple[int], device: device, is_decoder: bool
+    ) -> Tensor:
+        """
+        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+        Arguments:
+            attention_mask (`torch.Tensor`):
+                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+            input_shape (`tuple[int]`):
+                The shape of the input to the model.
+            device (`torch.device`):
+                The device of the input to the model.
+
+        Returns:
+            `torch.Tensor` The extended attention mask, with a the same dtype as `attention_mask.dtype`.
+        """
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if attention_mask.dim() == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.dim() == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if is_decoder:
+                batch_size, seq_length = input_shape
+
+                seq_ids = torch.arange(seq_length, device=device)
+                causal_mask = seq_ids[None, None, :].repeat(batch_size, seq_length, 1) <= seq_ids[None, :, None]
+                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
+                causal_mask = causal_mask.to(attention_mask.dtype)
+
+                if causal_mask.shape[1] < attention_mask.shape[1]:
+                    prefix_seq_len = attention_mask.shape[1] - causal_mask.shape[1]
+                    causal_mask = torch.cat(
+                        [
+                            torch.ones(
+                                (batch_size, seq_length, prefix_seq_len), device=device, dtype=causal_mask.dtype
+                            ),
+                            causal_mask,
+                        ],
+                        axis=-1,
+                    )
+
+                extended_attention_mask = causal_mask[:, None, :, :] * attention_mask[:, None, None, :]
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = extended_attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        return extended_attention_mask
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        is_decoder: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`Cache`, *optional*):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            batch_size, seq_length = input_shape
+            device = input_ids.device
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size, seq_length = input_shape
+            device = inputs_embeds.device
+        elif encoder_embeds is not None:
+            input_shape = encoder_embeds.size()[:-1]
+            batch_size, seq_length = input_shape
+            device = encoder_embeds.device
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")
+
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = (
+                past_key_values[0][0].shape[-2]
+                if not isinstance(past_key_values, Cache)
+                else past_key_values.get_seq_length()
+            )
+
+        if attention_mask is None:
+            attention_mask = torch.ones((batch_size, seq_length + past_key_values_length)).to(device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(
+            attention_mask, input_shape, device, is_decoder
+        )
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, list):
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states[0].size()
+            else:
+                encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+            if isinstance(encoder_attention_mask, list):
+                encoder_extended_attention_mask = [self.invert_attention_mask(mask) for mask in encoder_attention_mask]
+            elif encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+            else:
+                encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        if encoder_embeds is None:
+            embedding_output = self.embeddings(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                inputs_embeds=inputs_embeds,
+                past_key_values_length=past_key_values_length,
+            )
+        else:
+            embedding_output = encoder_embeds
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
+class BlipTextLMHeadModel(BlipTextPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["cls.predictions.decoder.weight", "cls.predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = BlipTextModel(config, add_pooling_layer=False)
+        self.cls = BlipTextOnlyMLMHead(config)
+        self.label_smoothing = config.label_smoothing
+
+    def get_input_embeddings(self):
+        return self.bert.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        self.bert.set_input_embeddings(new_embeddings)
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        return_logits: Optional[bool] = False,
+        is_decoder: Optional[bool] = True,
+        reduction: Optional[str] = "mean",
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        encoder_hidden_states (`torch.FloatTensor`, *optional*): Sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is
+            configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (`torch.LongTensor`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
+        past_key_values (`Cache`, *optional*):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            is_decoder=is_decoder,
+            cache_position=cache_position,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        if return_logits:
+            return prediction_scores[:, :-1, :].contiguous()
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+            labels = labels[:, 1:].contiguous().to(shifted_prediction_scores.device)
+            loss_fct = CrossEntropyLoss(reduction=reduction, label_smoothing=self.label_smoothing)
+            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+            if reduction == "none":
+                lm_loss = lm_loss.view(prediction_scores.size(0), -1).sum(1)
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
+        # Overwrite -- hardcoded key return (`is_decoder=True`)
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            **model_kwargs,
+        )
+        model_inputs["is_decoder"] = True
+
+        return model_inputs
+
+
+__all__ = ["BlipTextModel", "BlipTextLMHeadModel", "BlipTextPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py
new file mode 100644
index 0000000000000000000000000000000000000000..a1a1f7928273c69de30b4ba1c190435d91dfd509
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip.py
@@ -0,0 +1,1709 @@
+# coding=utf-8
+# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow BLIP model."""
+
+from __future__ import annotations
+
+import warnings
+from dataclasses import dataclass
+from typing import Any
+
+import tensorflow as tf
+
+from ...modeling_tf_outputs import TFBaseModelOutput, TFBaseModelOutputWithPooling
+from ...modeling_tf_utils import (
+    TFPreTrainedModel,
+    get_initializer,
+    get_tf_activation,
+    keras,
+    keras_serializable,
+    shape_list,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_blip import BlipConfig, BlipTextConfig, BlipVisionConfig
+from .modeling_tf_blip_text import BLIP_TEXT_INPUTS_DOCSTRING, TFBlipTextLMHeadModel, TFBlipTextModel
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "Salesforce/blip-vqa-base"
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.contrastive_loss
+def contrastive_loss(logits: tf.Tensor) -> tf.Tensor:
+    return tf.math.reduce_mean(
+        keras.metrics.sparse_categorical_crossentropy(
+            y_true=tf.range(shape_list(logits)[0]), y_pred=logits, from_logits=True
+        )
+    )
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.clip_loss with clip->blip
+def blip_loss(similarity: tf.Tensor) -> tf.Tensor:
+    caption_loss = contrastive_loss(similarity)
+    image_loss = contrastive_loss(tf.transpose(similarity))
+    return (caption_loss + image_loss) / 2.0
+
+
+@dataclass
+class TFBlipForConditionalGenerationModelOutput(ModelOutput):
+    """
+    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
+    last hidden states. This class also adds the loss term from the text decoder.
+
+    Args:
+        loss (`tf.Tensor`, *optional*, returned when `labels` is provided, `tf.Tensor` of shape `(1,)`):
+            Language modeling loss from the text decoder.
+        logits (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`, *optional*):
+            Prediction scores of the language modeling head of the text decoder model.
+        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)`, *optional*):
+            The image embeddings obtained after applying the Vision Transformer model to the input image.
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
+            the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.`
+    """
+
+    loss: tuple[tf.Tensor] | None = None
+    logits: tuple[tf.Tensor] | None = None
+    image_embeds: tf.Tensor | None = None
+    last_hidden_state: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+    attentions: tuple[tf.Tensor, ...] | None = None
+
+    @property
+    def decoder_logits(self):
+        warnings.warn(
+            "`decoder_logits` attribute is deprecated and will be removed in version 5 of Transformers."
+            " Please use the `logits` attribute to retrieve the final output instead.",
+            FutureWarning,
+        )
+        return self.logits
+
+
+@dataclass
+class TFBlipTextVisionModelOutput(ModelOutput):
+    """
+    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
+    last hidden states. This class also adds the loss term from the text decoder.
+
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss from the text decoder.
+        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+            The image embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
+            the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: tf.Tensor | None = None
+    image_embeds: tf.Tensor | None = None
+    last_hidden_state: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+    attentions: tuple[tf.Tensor, ...] | None = None
+
+
+@dataclass
+class TFBlipImageTextMatchingModelOutput(ModelOutput):
+    """
+    Adapted from the base class for vision model's outputs that also contains image embeddings of the pooling of the
+    last hidden states. This class also adds the loss term from the text decoder as well as the image-text similarity
+    scores.
+
+    Args:
+        itm_score (`tf.Tensor`):
+            The image-text similarity scores.
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss from the text decoder.
+        image_embeds (`tf.Tensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+            The image embeddings obtained by applying the projection layer to the pooler_output.
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings, if the model has an embedding layer, + one for
+            the output of each layer) of shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the optional initial embedding outputs.
+        vision_pooler_output (`tf.Tensor` of shape `(batch_size, hidden_size)`, *optional*):
+            Last layer hidden-state of the vision of the vision-only branch of the model.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        question_embeds (`tf.Tensor`):
+            The question embeddings obtained by the text projection layer.
+    """
+
+    itm_score: tf.Tensor | None = None
+    loss: tf.Tensor | None = None
+    image_embeds: tf.Tensor | None = None
+    last_hidden_state: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+    vision_pooler_output: tf.Tensor | None = None
+    attentions: tuple[tf.Tensor, ...] | None = None
+    question_embeds: tuple[tf.Tensor] | None = None
+
+
+@dataclass
+class TFBlipOutput(ModelOutput):
+    """
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `return_loss` is `True`):
+            Contrastive loss for image-text similarity.
+        logits_per_image:(`tf.Tensor` of shape `(image_batch_size, text_batch_size)`):
+            The scaled dot product scores between `image_embeds` and `text_embeds`. This represents the image-text
+            similarity scores.
+        logits_per_text:(`tf.Tensor` of shape `(text_batch_size, image_batch_size)`):
+            The scaled dot product scores between `text_embeds` and `image_embeds`. This represents the text-image
+            similarity scores.
+        text_embeds(`tf.Tensor` of shape `(batch_size, output_dim`):
+            The text embeddings obtained by applying the projection layer to the pooled output of [`BlipTextModel`].
+        image_embeds(`tf.Tensor` of shape `(batch_size, output_dim`):
+            The image embeddings obtained by applying the projection layer to the pooled output of [`BlipVisionModel`].
+        text_model_output(`BaseModelOutputWithPooling`):
+            The output of the [`BlipTextModel`].
+        vision_model_output(`BaseModelOutputWithPooling`):
+            The output of the [`BlipVisionModel`].
+    """
+
+    loss: tf.Tensor | None = None
+    logits_per_image: tf.Tensor | None = None
+    logits_per_text: tf.Tensor | None = None
+    text_embeds: tf.Tensor | None = None
+    image_embeds: tf.Tensor | None = None
+    text_model_output: TFBaseModelOutputWithPooling = None
+    vision_model_output: TFBaseModelOutputWithPooling = None
+
+    def to_tuple(self) -> tuple[Any]:
+        return tuple(
+            self[k] if k not in ["text_model_output", "vision_model_output"] else getattr(self, k).to_tuple()
+            for k in self.keys()
+        )
+
+
+class TFBlipVisionEmbeddings(keras.layers.Layer):
+    def __init__(self, config: BlipVisionConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.patch_embedding = keras.layers.Conv2D(
+            filters=self.embed_dim,
+            kernel_size=self.patch_size,
+            strides=self.patch_size,
+            kernel_initializer=get_initializer(self.config.initializer_range),
+            data_format="channels_last",
+            name="patch_embedding",
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+
+    def build(self, input_shape=None):
+        self.class_embedding = self.add_weight(
+            shape=(1, 1, self.embed_dim),
+            initializer=get_initializer(self.config.initializer_range),
+            trainable=True,
+            name="class_embedding",
+        )
+
+        self.position_embedding = self.add_weight(
+            shape=(1, self.num_positions, self.embed_dim),
+            initializer=get_initializer(self.config.initializer_range),
+            trainable=True,
+            name="position_embedding",
+        )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "patch_embedding", None) is not None:
+            with tf.name_scope(self.patch_embedding.name):
+                self.patch_embedding.build([None, None, None, 3])
+
+    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+        # Input is channels-first, we transpose. PyTorch transposes after the conv because PyTorch
+        # likes channels-first convs.
+        batch_size = tf.shape(pixel_values)[0]
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+        patch_embeds = self.patch_embedding(pixel_values)
+        patch_embeds = tf.reshape(patch_embeds, (batch_size, self.num_patches, -1))
+
+        class_embeds = tf.broadcast_to(self.class_embedding, (batch_size, 1, self.embed_dim))
+        embeddings = tf.concat([class_embeds, patch_embeds], axis=1)
+        embeddings = embeddings + self.position_embedding[:, : tf.shape(embeddings)[1], :]
+        return embeddings
+
+
+# Copied from transformers.models.clip.modeling_tf_clip.TFCLIPTextEmbeddings with CLIP->Blip
+class TFBlipTextEmbeddings(keras.layers.Layer):
+    def __init__(self, config: BlipTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.embed_dim = config.hidden_size
+
+        self.config = config
+
+    def build(self, input_shape: tf.TensorShape = None):
+        with tf.name_scope("token_embedding"):
+            self.weight = self.add_weight(
+                shape=(self.config.vocab_size, self.embed_dim),
+                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+                trainable=True,
+                name="weight",
+            )
+
+        with tf.name_scope("position_embedding"):
+            self.position_embedding = self.add_weight(
+                shape=(self.config.max_position_embeddings, self.embed_dim),
+                initializer=get_initializer(self.config.initializer_factor * self.config.initializer_range),
+                trainable=True,
+                name="embeddings",
+            )
+
+        super().build(input_shape)
+
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+    ) -> tf.Tensor:
+        """
+        Applies embedding based on inputs tensor.
+
+        Returns:
+            final_embeddings (`tf.Tensor`): output embedding tensor.
+        """
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        input_shape = shape_list(inputs_embeds)[:-1]
+
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(start=0, limit=input_shape[-1]), axis=0)
+
+        position_embeds = tf.gather(params=self.position_embedding, indices=position_ids)
+        position_embeds = tf.tile(input=position_embeds, multiples=(input_shape[0], 1, 1))
+        final_embeddings = inputs_embeds + position_embeds
+
+        return final_embeddings
+
+
+class TFBlipAttention(keras.layers.Layer):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = keras.layers.Dropout(config.attention_dropout, name="dropout")
+
+        self.qkv = keras.layers.Dense(
+            3 * self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
+        )
+
+        self.projection = keras.layers.Dense(
+            self.embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
+        )
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        head_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = False,
+        training: bool | None = None,
+    ) -> tuple[tf.Tensor, tf.Tensor | None, tuple[tf.Tensor] | None]:
+        """Input shape: Batch x Time x Channel"""
+
+        bsz, tgt_len, embed_dim = shape_list(hidden_states)
+
+        mixed_qkv = self.qkv(hidden_states)
+        mixed_qkv = tf.reshape(mixed_qkv, (bsz, tgt_len, 3, self.num_heads, self.head_dim))
+        mixed_qkv = tf.transpose(mixed_qkv, perm=(2, 0, 3, 1, 4))
+
+        query_states, key_states, value_states = mixed_qkv[0], mixed_qkv[1], mixed_qkv[2]
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = query_states @ tf.transpose(key_states, (0, 1, 3, 2))
+
+        attention_scores = attention_scores * self.scale
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = tf.transpose(attention_probs @ value_states, perm=(0, 2, 1, 3))
+
+        new_context_layer_shape = shape_list(context_layer)[:-2] + [self.embed_dim]
+        context_layer = tf.reshape(context_layer, new_context_layer_shape)
+
+        output = self.projection(context_layer)
+
+        outputs = (output, attention_probs) if output_attentions else (output, None)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dropout", None) is not None:
+            with tf.name_scope(self.dropout.name):
+                self.dropout.build(None)
+        if getattr(self, "qkv", None) is not None:
+            with tf.name_scope(self.qkv.name):
+                self.qkv.build([None, None, self.embed_dim])
+        if getattr(self, "projection", None) is not None:
+            with tf.name_scope(self.projection.name):
+                self.projection.build([None, None, self.embed_dim])
+
+
+class TFBlipMLP(keras.layers.Layer):
+    def __init__(self, config: BlipConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.activation_fn = get_tf_activation(config.hidden_act)
+
+        in_proj_std = (config.hidden_size**-0.5) * ((2 * config.num_hidden_layers) ** -0.5)
+        fc_std = (2 * config.hidden_size) ** -0.5
+
+        self.fc1 = keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(fc_std), name="fc1"
+        )
+        self.fc2 = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(in_proj_std), name="fc2"
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.fc1(inputs=hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(inputs=hidden_states)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "fc1", None) is not None:
+            with tf.name_scope(self.fc1.name):
+                self.fc1.build([None, None, self.config.hidden_size])
+        if getattr(self, "fc2", None) is not None:
+            with tf.name_scope(self.fc2.name):
+                self.fc2.build([None, None, self.config.intermediate_size])
+
+
+class TFBlipEncoderLayer(keras.layers.Layer):
+    def __init__(self, config: BlipConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = config.hidden_size
+        self.self_attn = TFBlipAttention(config, name="self_attn")
+        self.layer_norm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm1")
+        self.mlp = TFBlipMLP(config, name="mlp")
+        self.layer_norm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layer_norm2")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        output_attentions: bool | None = False,
+        training: bool | None = None,
+    ) -> tuple[tf.Tensor]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`tf.Tensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            head_mask=attention_mask,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        hidden_states = hidden_states + residual
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+
+        hidden_states = hidden_states + residual
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attn", None) is not None:
+            with tf.name_scope(self.self_attn.name):
+                self.self_attn.build(None)
+        if getattr(self, "layer_norm1", None) is not None:
+            with tf.name_scope(self.layer_norm1.name):
+                self.layer_norm1.build([None, None, self.embed_dim])
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "layer_norm2", None) is not None:
+            with tf.name_scope(self.layer_norm2.name):
+                self.layer_norm2.build([None, None, self.embed_dim])
+
+
+class TFBlipPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = BlipConfig
+    base_model_prefix = "blip"
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+
+BLIP_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`BlipConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+BLIP_VISION_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+BLIP_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it. Pixel values can be obtained using
+            [`BlipImageProcessor`]. See [`BlipImageProcessor.__call__`] for details.
+        return_loss (`bool`, *optional*):
+            Whether or not to return the contrastive loss.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@keras_serializable
+class TFBlipEncoder(keras.layers.Layer):
+    config_class = BlipConfig
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`BlipEncoderLayer`].
+
+    Args:
+        config (`BlipConfig`):
+            The corresponding vision configuration for the `BlipEncoder`.
+    """
+
+    def __init__(self, config: BlipConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.layers = [TFBlipEncoderLayer(config, name=f"layers_._{i}") for i in range(config.num_hidden_layers)]
+
+    @unpack_inputs
+    def call(
+        self,
+        inputs_embeds,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> tuple | TFBaseModelOutput:
+        r"""
+        Args:
+            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Embedded representation of the inputs. Should be float, not int tokens.
+            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                output_attentions=output_attentions,
+                training=training,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFBlipVisionModel(TFBlipPreTrainedModel):
+    main_input_name = "pixel_values"
+    config_class = BlipVisionConfig
+
+    def __init__(self, config: BlipVisionConfig, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+        self.config = config
+
+        self.embeddings = TFBlipVisionEmbeddings(config, name="embeddings")
+        self.encoder = TFBlipEncoder(config, name="encoder")
+        self.post_layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="post_layernorm")
+        self.embed_dim = config.hidden_size
+
+    def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
+        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=output.last_hidden_state,
+            pooler_output=output.pooler_output,
+            hidden_states=hs,
+            attentions=attns,
+        )
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithPooling, config_class=BlipVisionConfig)
+    def call(
+        self,
+        pixel_values: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> tuple | TFBaseModelOutputWithPooling:
+        r"""
+        Returns:
+
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        pooled_output = last_hidden_state[:, 0, :]
+        # TF gets confused if we call the layer with inputs of different ranks, so insert a singleton dimension
+        pooled_output = self.post_layernorm(tf.expand_dims(pooled_output, 1))
+        pooled_output = tf.squeeze(pooled_output, 1)
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "post_layernorm", None) is not None:
+            with tf.name_scope(self.post_layernorm.name):
+                self.post_layernorm.build([None, None, self.embed_dim])
+
+
+class TFBlipMainLayer(keras.layers.Layer):
+    config_class = BlipConfig
+
+    def __init__(self, config: BlipConfig, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        if not isinstance(config.text_config, BlipTextConfig):
+            raise TypeError(
+                "config.text_config is expected to be of type BlipTextConfig but is of type"
+                f" {type(config.text_config)}."
+            )
+
+        if not isinstance(config.vision_config, BlipVisionConfig):
+            raise TypeError(
+                "config.vision_config is expected to be of type BlipVisionConfig but is of type"
+                f" {type(config.vision_config)}."
+            )
+
+        text_config = config.text_config
+        vision_config = config.vision_config
+
+        self.projection_dim = config.projection_dim
+        self.text_embed_dim = text_config.hidden_size
+        self.vision_embed_dim = vision_config.hidden_size
+
+        self.text_model = TFBlipTextModel(text_config, name="text_model")
+        self.vision_model = TFBlipVisionModel(vision_config, name="vision_model")
+
+        self.visual_projection = keras.layers.Dense(
+            self.projection_dim,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="visual_projection",
+        )
+        self.text_projection = keras.layers.Dense(
+            self.projection_dim,
+            use_bias=False,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="text_projection",
+        )
+
+        self.config = config
+
+    def build(self, input_shape=None):
+        self.logit_scale = self.add_weight(
+            name="logit_scale",
+            shape=[],
+            initializer=keras.initializers.Constant(self.config.logit_scale_init_value),
+            trainable=True,
+        )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "text_model", None) is not None:
+            with tf.name_scope(self.text_model.name):
+                self.text_model.build(None)
+        if getattr(self, "vision_model", None) is not None:
+            with tf.name_scope(self.vision_model.name):
+                self.vision_model.build(None)
+        if getattr(self, "visual_projection", None) is not None:
+            with tf.name_scope(self.visual_projection.name):
+                self.visual_projection.build([None, None, self.vision_embed_dim])
+        if getattr(self, "text_projection", None) is not None:
+            with tf.name_scope(self.text_projection.name):
+                self.text_projection.build([None, None, self.text_embed_dim])
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        return_loss: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> tuple | TFBlipOutput:
+        # Use BLIP model's config for some fields (if specified) instead of those of vision & text components.
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        text_outputs = self.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        image_embeds = vision_outputs[1]
+        image_embeds = self.visual_projection(image_embeds)
+
+        text_embeds = text_outputs[1]
+        text_embeds = self.text_projection(text_embeds)
+
+        # normalized features
+        image_embeds = image_embeds / tf.norm(image_embeds, ord=2, axis=-1, keepdims=True)
+        text_embeds = text_embeds / tf.norm(text_embeds, ord=2, axis=-1, keepdims=True)
+
+        # cosine similarity as logits
+        logit_scale = tf.exp(self.logit_scale)
+        logits_per_text = tf.matmul(text_embeds, image_embeds, transpose_b=True) * logit_scale
+        logits_per_image = tf.transpose(logits_per_text)
+
+        loss = None
+        if return_loss:
+            loss = blip_loss(logits_per_text)
+            loss = tf.reshape(loss, (1,))
+
+        if not return_dict:
+            output = (logits_per_image, logits_per_text, text_embeds, image_embeds, text_outputs, vision_outputs)
+            return ((loss,) + output) if loss is not None else output
+
+        return TFBlipOutput(
+            loss=loss,
+            logits_per_image=logits_per_image,
+            logits_per_text=logits_per_text,
+            text_embeds=text_embeds,
+            image_embeds=image_embeds,
+            text_model_output=text_outputs,
+            vision_model_output=vision_outputs,
+        )
+
+
+class TFBlipModel(TFBlipPreTrainedModel):
+    config_class = BlipConfig
+    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
+    main_input_name = "input_ids"
+
+    def __init__(self, config: BlipConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.blip = TFBlipMainLayer(config, name="blip")
+
+    def serving_output(self, output: TFBlipOutput) -> TFBlipOutput:
+        return TFBlipOutput(
+            logits_per_image=output.logits_per_image,
+            logits_per_text=output.logits_per_text,
+            text_embeds=output.text_embeds,
+            image_embeds=output.image_embeds,
+        )
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(BLIP_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBlipOutput, config_class=BlipConfig)
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        pixel_values: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        return_loss: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> tuple | TFBlipOutput:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFBlipModel
+
+        >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(
+        ...     text=["a photo of a cat", "a photo of a dog"], images=image, return_tensors="tf", padding=True
+        ... )
+
+        >>> outputs = model(**inputs)
+        >>> logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
+        >>> probs = tf.nn.softmax(logits_per_image, axis=1)  # we can take the softmax to get the label probabilities
+        ```"""
+        outputs = self.blip(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            return_loss=return_loss,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+    @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
+    def get_text_features(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        return_dict: bool | None = None,
+    ) -> tf.Tensor:
+        r"""
+        Returns:
+            text_features (`tf.Tensor` of shape `(batch_size, output_dim`): The text embeddings obtained by applying
+            the projection layer to the pooled output of [`TFBlipTextModel`].
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, TFBlipModel
+
+        >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> inputs = processor(text=["a photo of a cat", "a photo of a dog"], padding=True, return_tensors="tf")
+        >>> text_features = model.get_text_features(**inputs)
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        text_outputs = self.blip.text_model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            return_dict=return_dict,
+        )
+
+        pooled_output = text_outputs[1]
+        text_features = self.blip.text_projection(pooled_output)
+
+        return text_features
+
+    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
+    def get_image_features(
+        self,
+        pixel_values: tf.Tensor | None = None,
+        return_dict: bool | None = None,
+    ) -> tf.Tensor:
+        r"""
+        Returns:
+            image_features (`tf.Tensor` of shape `(batch_size, output_dim`): The image embeddings obtained by applying
+            the projection layer to the pooled output of [`TFBlipVisionModel`].
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFBlipModel
+
+        >>> model = TFBlipModel.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="tf")
+
+        >>> image_features = model.get_image_features(**inputs)
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.blip.vision_model(pixel_values=pixel_values, return_dict=return_dict)
+
+        pooled_output = vision_outputs[1]  # pooled_output
+        image_features = self.blip.visual_projection(pooled_output)
+
+        return image_features
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "blip", None) is not None:
+            with tf.name_scope(self.blip.name):
+                self.blip.build(None)
+
+
+@add_start_docstrings(
+    """
+    BLIP Model for image captioning. The model consists of a vision encoder and a text decoder. One can optionally pass
+    `input_ids` to the model, which serve as a text prompt, to make the text decoder continue the prompt. Otherwise,
+    the decoder starts generating text from the [BOS] (beginning-of-sequence) token. will start generating the caption
+    from the text input. If no text input is provided, the decoder will start with the [BOS] token only.
+    """,
+    BLIP_START_DOCSTRING,
+)
+class TFBlipForConditionalGeneration(TFBlipPreTrainedModel):
+    config_class = BlipConfig
+    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: BlipConfig, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+
+        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
+
+        self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")
+
+        self.decoder_input_ids = config.text_config.bos_token_id
+        self.decoder_pad_token_id = config.text_config.pad_token_id
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.vision_model.embeddings.patch_embedding
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBlipForConditionalGenerationModelOutput, config_class=BlipConfig)
+    def call(
+        self,
+        pixel_values: tf.Tensor,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        labels: tf.Tensor | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> tuple | TFBlipForConditionalGenerationModelOutput:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration
+
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "A picture of"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="tf")
+
+        >>> outputs = model(**inputs)
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        image_embeds = vision_outputs[0]
+
+        outputs = self.text_decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            labels=labels,
+            return_dict=False,
+            training=training,
+        )
+
+        if not return_dict:
+            outputs = (outputs[0], outputs[1], image_embeds, vision_outputs[0]) + vision_outputs[2:]
+            return tuple(output for output in outputs if output is not None)
+
+        if labels is not None:
+            loss = outputs[0]
+            logits = outputs[1]
+        else:
+            loss = None
+            logits = outputs[0]
+
+        if loss is not None and loss.shape.rank == 0:
+            loss = tf.reshape(loss, (1,))
+
+        return TFBlipForConditionalGenerationModelOutput(
+            loss=loss,
+            logits=logits,
+            image_embeds=image_embeds,
+            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+        )
+
+    def generate(
+        self,
+        pixel_values: tf.Tensor,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        **generate_kwargs,
+    ) -> tf.Tensor:
+        r"""
+        Overrides *generate* function to be able to use the model as a conditional generator
+
+        Parameters:
+            pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`:
+                Input image to be processed
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                The sequence used as a prompt for the generation.
+            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFBlipForConditionalGeneration
+
+        >>> model = TFBlipForConditionalGeneration.from_pretrained("Salesforce/blip-image-captioning-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-image-captioning-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="tf")
+
+        >>> outputs = model.generate(**inputs)
+        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
+        two cats sleeping on a couch
+        ```
+        """
+
+        batch_size = pixel_values.shape[0]
+        vision_outputs = self.vision_model(pixel_values=pixel_values)
+
+        image_embeds = vision_outputs[0]
+
+        image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32)
+
+        if isinstance(input_ids, list):
+            input_ids = tf.convert_to_tensor(input_ids, dtype=tf.int32)
+        elif input_ids is None:
+            input_ids = tf.convert_to_tensor(
+                [[self.decoder_input_ids, self.config.text_config.eos_token_id]], dtype=tf.int32
+            )
+
+            input_ids = tf.tile(input_ids, (batch_size, 1))
+
+        # PyTorch: input_ids[:, 0] = self.config.text_config.bos_token_id
+        input_ids = tf.concat(
+            [tf.ones((batch_size, 1), dtype=tf.int32) * self.config.text_config.bos_token_id, input_ids[:, 1:]], axis=1
+        )
+        attention_mask = attention_mask[:, :-1] if attention_mask is not None else None
+
+        outputs = self.text_decoder.generate(
+            input_ids=input_ids[:, :-1],
+            eos_token_id=self.config.text_config.sep_token_id,
+            pad_token_id=self.config.text_config.pad_token_id,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            **generate_kwargs,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "vision_model", None) is not None:
+            with tf.name_scope(self.vision_model.name):
+                self.vision_model.build(None)
+        if getattr(self, "text_decoder", None) is not None:
+            with tf.name_scope(self.text_decoder.name):
+                self.text_decoder.build(None)
+
+
+@add_start_docstrings(
+    """
+    BLIP Model for visual question answering. The model consists of a vision encoder, a text encoder as well as a text
+    decoder. The vision encoder will encode the input image, the text encoder will encode the input question together
+    with the encoding of the image, and the text decoder will output the answer to the question.
+    """,
+    BLIP_START_DOCSTRING,
+)
+class TFBlipForQuestionAnswering(TFBlipPreTrainedModel):
+    config_class = BlipConfig
+    _keys_to_ignore_on_load_missing = [r"text_decoder.cls.predictions.decoder.bias"]
+
+    def __init__(self, config: BlipConfig, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+
+        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
+
+        self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
+
+        self.text_decoder = TFBlipTextLMHeadModel(config.text_config, name="text_decoder")
+
+        self.decoder_pad_token_id = config.text_config.pad_token_id
+        self.decoder_start_token_id = config.text_config.bos_token_id
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.vision_model.embeddings.patch_embedding
+
+    # Adapted from transformers.models.t5.modeling_tf_t5.TFT5PreTrainedModel._shift_right
+    def _shift_right(self, input_ids):
+        decoder_start_token_id = self.decoder_start_token_id
+        pad_token_id = self.decoder_pad_token_id
+
+        if decoder_start_token_id is None or pad_token_id is None:
+            raise ValueError("decoder_start_token_id and pad_token_id must be defined!")
+
+        start_tokens = tf.fill((shape_list(input_ids)[0], 1), decoder_start_token_id)
+        start_tokens = tf.cast(start_tokens, input_ids.dtype)  # Ensure compatible dtypes for concatenation
+        shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
+
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids = tf.where(
+            shifted_input_ids == -100,
+            tf.cast(tf.fill(shape_list(shifted_input_ids), pad_token_id), shifted_input_ids.dtype),
+            shifted_input_ids,
+        )
+
+        # "Verify that `labels` has only positive values and -100"
+        tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=shifted_input_ids.dtype))
+
+        return shifted_input_ids
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBlipTextVisionModelOutput, config_class=BlipVisionConfig)
+    def call(
+        self,
+        input_ids: tf.Tensor,
+        pixel_values: tf.Tensor | None = None,
+        decoder_input_ids: tf.Tensor | None = None,
+        decoder_attention_mask: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        labels: tf.Tensor | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> tuple | TFBlipTextVisionModelOutput:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering
+
+        >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> # training
+        >>> text = "How many cats are in the picture?"
+        >>> label = "2"
+        >>> inputs = processor(images=image, text=text, return_tensors="tf")
+        >>> labels = processor(text=label, return_tensors="tf").input_ids
+
+        >>> inputs["labels"] = labels
+        >>> outputs = model(**inputs)
+        >>> loss = outputs.loss
+
+        >>> # inference
+        >>> text = "How many cats are in the picture?"
+        >>> inputs = processor(images=image, text=text, return_tensors="tf")
+        >>> outputs = model.generate(**inputs)
+        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
+        2
+        ```"""
+        if labels is None and decoder_input_ids is None:
+            raise ValueError(
+                "Either `decoder_input_ids` or `labels` should be passed when calling"
+                " `TFBlipForQuestionAnswering`. if you are training the model make sure that `labels` is passed, if you"
+                " are using the model for inference make sure that `decoder_input_ids` is passed or call `generate`"
+            )
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        image_embeds = vision_outputs[0]
+        image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64)
+
+        question_embeds = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        question_embeds = question_embeds[0] if not return_dict else question_embeds.last_hidden_state
+
+        if labels is not None and decoder_input_ids is None:
+            # labels are already shifted right, see: https://github.com/huggingface/transformers/pull/23153
+            decoder_input_ids = labels
+
+        answer_output = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=question_embeds,
+            encoder_attention_mask=attention_mask,
+            labels=labels,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if labels is not None:
+            decoder_loss = tf.reduce_mean(answer_output.loss) if return_dict else tf.reduce_mean(answer_output[0])
+        else:
+            decoder_loss = None
+
+        if not return_dict:
+            outputs = (decoder_loss, image_embeds, vision_outputs[0]) + vision_outputs[2:]
+            return tuple(output for output in outputs if output is not None)
+
+        return TFBlipTextVisionModelOutput(
+            loss=decoder_loss,
+            image_embeds=image_embeds,
+            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+        )
+
+    def generate(
+        self,
+        input_ids: tf.Tensor,
+        pixel_values: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        **generate_kwargs,
+    ) -> tf.Tensor:
+        r"""
+        Overrides *generate* function to be able to use the model as a conditional generator
+
+        Parameters:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                The sequence used as a prompt for the generation.
+            pixel_values (`tf.Tensor` of shape `(batch_size, num_channels, image_height, image_width)`:
+                Input image to be processed
+            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`. `1` for
+                tokens that are NOT MASKED, `0` for MASKED tokens.
+            generate_kwargs (dict, *optional*):
+                Additional arguments passed to the `generate` function of the decoder
+
+
+        Examples:
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFBlipForQuestionAnswering
+
+        >>> model = TFBlipForQuestionAnswering.from_pretrained("Salesforce/blip-vqa-base")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-vqa-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "How many cats are in the picture?"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="tf")
+
+        >>> outputs = model.generate(**inputs)
+        >>> print(processor.decode(outputs[0], skip_special_tokens=True))
+        2
+        ```
+        """
+        vision_outputs = self.vision_model(pixel_values=pixel_values)
+
+        image_embeds = vision_outputs[0]
+
+        image_attention_mask = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int32)
+
+        if isinstance(input_ids, list):
+            input_ids = tf.Tensor(input_ids)
+
+        question_outputs = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_attention_mask,
+            return_dict=False,
+        )
+
+        question_embeds = question_outputs[0]
+
+        question_attention_mask = tf.ones(shape_list(question_embeds)[:-1], dtype=tf.int32)
+
+        bos_ids = tf.fill(
+            (tf.shape(question_embeds)[0], 1), value=tf.cast(self.decoder_start_token_id, input_ids.dtype)
+        )
+
+        outputs = self.text_decoder.generate(
+            input_ids=bos_ids,
+            eos_token_id=self.config.text_config.sep_token_id,
+            pad_token_id=self.config.text_config.pad_token_id,
+            encoder_hidden_states=question_embeds,
+            encoder_attention_mask=question_attention_mask,
+            **generate_kwargs,
+        )
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "vision_model", None) is not None:
+            with tf.name_scope(self.vision_model.name):
+                self.vision_model.build(None)
+        if getattr(self, "text_encoder", None) is not None:
+            with tf.name_scope(self.text_encoder.name):
+                self.text_encoder.build(None)
+        if getattr(self, "text_decoder", None) is not None:
+            with tf.name_scope(self.text_decoder.name):
+                self.text_decoder.build(None)
+
+
+@add_start_docstrings(
+    """
+    BLIP Model with a vision and text projector, and a classification head on top. The model is used in the context of
+    image-text retrieval. Given an image and a text, the model returns the probability of the text being relevant to
+    the image.
+    """,
+    BLIP_START_DOCSTRING,
+)
+class TFBlipForImageTextRetrieval(TFBlipPreTrainedModel):
+    config_class = BlipConfig
+
+    def __init__(self, config: BlipConfig, *args, **kwargs):
+        super().__init__(config, *args, **kwargs)
+
+        self.vision_model = TFBlipVisionModel(config.vision_config, name="vision_model")
+
+        self.text_encoder = TFBlipTextModel(config.text_config, name="text_encoder", add_pooling_layer=False)
+
+        # vision projection layer
+        self.vision_proj = keras.layers.Dense(
+            config.image_text_hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="vision_proj",
+        )
+
+        # text projection layer
+        self.text_proj = keras.layers.Dense(
+            config.image_text_hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="text_proj",
+        )
+
+        # image text matching head
+        self.itm_head = keras.layers.Dense(
+            2, kernel_initializer=get_initializer(config.initializer_range), name="itm_head"
+        )
+
+        self.decoder_pad_token_id = (
+            config.text_config.pad_token_id
+            if not hasattr(config, "decoder_pad_token_id")
+            else config.decoder_pad_token_id
+        )
+        self.decoder_start_token_id = (
+            config.text_config.bos_token_id
+            if not hasattr(config, "decoder_start_token_id")
+            else config.decoder_start_token_id
+        )
+        self.config = config
+
+    def get_input_embeddings(self) -> keras.layers.Layer:
+        return self.vision_model.embeddings.patch_embedding
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(BLIP_VISION_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBlipImageTextMatchingModelOutput, config_class=BlipVisionConfig)
+    def call(
+        self,
+        input_ids: tf.Tensor,
+        pixel_values: tf.Tensor | None = None,
+        use_itm_head: bool | None = True,
+        attention_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = None,
+    ) -> tuple | TFBlipImageTextMatchingModelOutput:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, TFBlipForImageTextRetrieval
+
+        >>> model = TFBlipForImageTextRetrieval.from_pretrained("Salesforce/blip-itm-base-coco")
+        >>> processor = AutoProcessor.from_pretrained("Salesforce/blip-itm-base-coco")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+        >>> text = "an image of a cat"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        vision_outputs = self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        image_embeds = vision_outputs[0]
+        image_atts = tf.ones(shape_list(image_embeds)[:-1], dtype=tf.int64)
+
+        # Matt: In PyTorch, only one path (itm/non-itm) is taken. However, in TensorFlow this can result in
+        # some layers not being built! To avoid this, we always call both paths, then use an if statement to select
+        # which output to pass to the final output. The unnecessary nodes will be pruned from the final graph, but
+        # not before the layers have all been built correctly.
+        itm_question_embeds = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=image_embeds,
+            encoder_attention_mask=image_atts,
+            return_dict=return_dict,
+            training=training,
+        )
+        itm_question_embeds = itm_question_embeds[0] if not return_dict else itm_question_embeds.last_hidden_state
+
+        itm_output = self.itm_head(itm_question_embeds[:, 0, :])
+
+        no_itm_question_embeds = self.text_encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            return_dict=return_dict,
+            training=training,
+        )
+        no_itm_question_embeds = (
+            no_itm_question_embeds[0] if not return_dict else no_itm_question_embeds.last_hidden_state
+        )
+
+        image_feat, _ = tf.linalg.normalize(self.vision_proj(image_embeds[:, 0, :]), ord=2, axis=-1)
+        text_feat, _ = tf.linalg.normalize(self.text_proj(no_itm_question_embeds[:, 0, :]), ord=2, axis=-1)
+
+        no_itm_output = tf.matmul(image_feat, text_feat, transpose_b=True)
+
+        if use_itm_head:
+            output = itm_output
+            question_embeds = itm_question_embeds
+        else:
+            output = no_itm_output
+            question_embeds = no_itm_question_embeds
+
+        if not return_dict:
+            outputs = (output, vision_outputs[0]) + vision_outputs[2:] + (question_embeds,)
+            return tuple(output for output in outputs if output is not None)
+
+        return TFBlipImageTextMatchingModelOutput(
+            itm_score=output,
+            last_hidden_state=vision_outputs.last_hidden_state,
+            hidden_states=vision_outputs.hidden_states,
+            attentions=vision_outputs.attentions,
+            question_embeds=question_embeds,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "vision_model", None) is not None:
+            with tf.name_scope(self.vision_model.name):
+                self.vision_model.build(None)
+        if getattr(self, "text_encoder", None) is not None:
+            with tf.name_scope(self.text_encoder.name):
+                self.text_encoder.build(None)
+        if getattr(self, "vision_proj", None) is not None:
+            with tf.name_scope(self.vision_proj.name):
+                self.vision_proj.build([None, None, self.config.vision_config.hidden_size])
+        if getattr(self, "text_proj", None) is not None:
+            with tf.name_scope(self.text_proj.name):
+                self.text_proj.build([None, None, self.config.text_config.hidden_size])
+        if getattr(self, "itm_head", None) is not None:
+            with tf.name_scope(self.itm_head.name):
+                self.itm_head.build([None, None, self.config.text_config.hidden_size])
+
+
+__all__ = [
+    "TFBlipModel",
+    "TFBlipPreTrainedModel",
+    "TFBlipForConditionalGeneration",
+    "TFBlipForQuestionAnswering",
+    "TFBlipVisionModel",
+    "TFBlipTextModel",
+    "TFBlipForImageTextRetrieval",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py
new file mode 100644
index 0000000000000000000000000000000000000000..7dae1126e03b12b39b17b8fdc00b6050f9681fcd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/modeling_tf_blip_text.py
@@ -0,0 +1,1122 @@
+# coding=utf-8
+# Copyright 2023 The Salesforce Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the BSD-3-clause license (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     https://opensource.org/licenses/BSD-3-Clause
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from __future__ import annotations
+
+import math
+
+import tensorflow as tf
+
+from ...modeling_tf_outputs import (
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFBaseModelOutputWithPoolingAndCrossAttentions,
+    TFCausalLMOutputWithCrossAttentions,
+)
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    get_initializer,
+    get_tf_activation,
+    keras,
+    keras_serializable,
+    shape_list,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, invert_attention_mask, stable_softmax
+from ...utils import add_start_docstrings_to_model_forward, logging
+from .configuration_blip import BlipTextConfig
+
+
+logger = logging.get_logger(__name__)
+
+BLIP_TEXT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoProcessor`]. See [`BlipProcessor.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L52
+class TFBlipTextEmbeddings(keras.layers.Layer):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.word_embeddings = keras.layers.Embedding(
+            config.vocab_size,
+            config.hidden_size,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="word_embeddings",
+        )
+        self.position_embeddings = keras.layers.Embedding(
+            config.max_position_embeddings,
+            config.hidden_size,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="position_embeddings",
+        )
+
+        # self.LayerNorm is not snake-cased to stick with PyTorch model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
+
+        self.position_ids = tf.expand_dims(tf.range(config.max_position_embeddings), 0)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+
+        self.config = config
+
+    def call(self, input_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0, training=None):
+        if input_ids is not None:
+            input_shape = tf.shape(input_ids)
+        else:
+            input_shape = tf.shape(inputs_embeds)[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.config.vocab_size)
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        embeddings = inputs_embeds
+
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings, training=training)
+        return embeddings
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "word_embeddings", None) is not None:
+            with tf.name_scope(self.word_embeddings.name):
+                self.word_embeddings.build(None)
+        if getattr(self, "position_embeddings", None) is not None:
+            with tf.name_scope(self.position_embeddings.name):
+                self.position_embeddings.build(None)
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+        if getattr(self, "dropout", None) is not None:
+            with tf.name_scope(self.dropout.name):
+                self.dropout.build(None)
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L97
+class TFBlipTextSelfAttention(keras.layers.Layer):
+    def __init__(self, config, is_cross_attention, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                "The hidden size (%d) is not a multiple of the number of attention heads (%d)"
+                % (config.hidden_size, config.num_attention_heads)
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = keras.layers.Dense(
+            self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
+
+        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = keras.layers.Embedding(
+                2 * config.max_position_embeddings - 1, self.attention_head_size
+            )
+        self.is_cross_attention = is_cross_attention
+
+    def transpose_for_scores(self, x):
+        new_x_shape = tf.concat(
+            [tf.shape(x)[:-1], tf.constant([self.num_attention_heads, self.attention_head_size], dtype=tf.int32)],
+            axis=0,
+        )
+        x = tf.reshape(x, new_x_shape)
+        return tf.transpose(x, perm=(0, 2, 1, 3))
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        training=None,
+    ):
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
+            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = shape_list(hidden_states)[1]
+            position_ids_l = tf.expand_dims(tf.range(seq_length, dtype=tf.int64, device=hidden_states.device), 1)
+            position_ids_r = tf.expand_dims(tf.range(seq_length, dtype=tf.int64, device=hidden_states.device), 0)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = tf.cast(positional_embedding, query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = tf.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = tf.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BlipTextModel forward() function)
+            attention_scores = attention_scores + tf.cast(attention_mask, attention_scores.dtype)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs_dropped = self.dropout(attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs_dropped = attention_probs_dropped * head_mask
+
+        context_layer = attention_probs_dropped @ value_layer
+
+        context_layer = tf.transpose(context_layer, perm=(0, 2, 1, 3))
+        new_context_layer_shape = shape_list(context_layer)[:-2] + [self.all_head_size]
+        context_layer = tf.reshape(context_layer, new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        outputs = outputs + (past_key_value,)
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.config.hidden_size])
+        if self.is_cross_attention:
+            if getattr(self, "key", None) is not None:
+                with tf.name_scope(self.key.name):
+                    self.key.build([None, None, self.config.encoder_hidden_size])
+            if getattr(self, "value", None) is not None:
+                with tf.name_scope(self.value.name):
+                    self.value.build([None, None, self.config.encoder_hidden_size])
+        else:
+            if getattr(self, "key", None) is not None:
+                with tf.name_scope(self.key.name):
+                    self.key.build([None, None, self.config.hidden_size])
+            if getattr(self, "value", None) is not None:
+                with tf.name_scope(self.value.name):
+                    self.value.build([None, None, self.config.hidden_size])
+
+
+class TFBlipTextSelfOutput(keras.layers.Layer):
+    def __init__(self, config: BlipTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool | None = None) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#242
+class TFBlipTextAttention(keras.layers.Layer):
+    def __init__(self, config, is_cross_attention=False, **kwargs):
+        super().__init__(**kwargs)
+        self.self = TFBlipTextSelfAttention(config, is_cross_attention, name="self")
+        # "output" is a protected attribute on TF models
+        self.self_output = TFBlipTextSelfOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        encoder_hidden_states: tf.Tensor | None = None,
+        encoder_attention_mask: tf.Tensor | None = None,
+        past_key_value: tuple[tuple[tf.Tensor]] | None = None,
+        output_attentions: bool | None = False,
+        training: bool | None = None,
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_value,
+            output_attentions,
+            training=training,
+        )
+        attention_output = self.self_output(self_outputs[0], hidden_states, training=training)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self", None) is not None:
+            with tf.name_scope(self.self.name):
+                self.self.build(None)
+        if getattr(self, "self_output", None) is not None:
+            with tf.name_scope(self.self_output.name):
+                self.self_output.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->BlipText
+class TFBlipTextIntermediate(keras.layers.Layer):
+    def __init__(self, config: BlipTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+class TFBlipTextOutput(keras.layers.Layer):
+    def __init__(self, config: BlipTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.intermediate_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+class TFBlipTextLayer(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.attention = TFBlipTextAttention(config, name="attention")
+        if self.config.is_decoder:
+            self.crossattention = TFBlipTextAttention(
+                config, is_cross_attention=self.config.is_decoder, name="crossattention"
+            )
+        self.intermediate = TFBlipTextIntermediate(config, name="intermediate")
+        self.self_output = TFBlipTextOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_value=None,
+        output_attentions=False,
+        training=None,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_value=self_attn_past_key_value,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+
+        outputs = self_attention_outputs[1:-1]
+        present_key_value = self_attention_outputs[-1]
+
+        if encoder_hidden_states is not None:
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.self_output(intermediate_output, attention_output, training=training)
+        outputs = (layer_output,) + outputs
+
+        outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "self_output", None) is not None:
+            with tf.name_scope(self.self_output.name):
+                self.self_output.build(None)
+        if getattr(self, "crossattention", None) is not None:
+            with tf.name_scope(self.crossattention.name):
+                self.crossattention.build(None)
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L386
+@keras_serializable
+class TFBlipTextEncoder(keras.layers.Layer):
+    config_class = BlipTextConfig
+
+    def __init__(self, config, name=None, **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.config = config
+        self.layer = [TFBlipTextLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+
+    @unpack_inputs
+    def call(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+        training=None,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.is_decoder else None
+
+        next_decoder_cache = () if use_cache else None
+
+        for i in range(self.config.num_hidden_layers):
+            layer_module = self.layer[i]
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_value,
+                output_attentions,
+                training=training,
+            )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer", None) is not None:
+            for layer in self.layer:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->BlipText
+class TFBlipTextPooler(keras.layers.Layer):
+    def __init__(self, config: BlipTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(inputs=first_token_tensor)
+
+        return pooled_output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->BlipText
+class TFBlipTextPredictionHeadTransform(keras.layers.Layer):
+    def __init__(self, config: BlipTextConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense = keras.layers.Dense(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="dense",
+        )
+
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.transform_act_fn = config.hidden_act
+
+        self.LayerNorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.config = config
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        hidden_states = self.dense(inputs=hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(inputs=hidden_states)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.config.hidden_size])
+        if getattr(self, "LayerNorm", None) is not None:
+            with tf.name_scope(self.LayerNorm.name):
+                self.LayerNorm.build([None, None, self.config.hidden_size])
+
+
+class TFBlipTextLMPredictionHead(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.transform = TFBlipTextPredictionHeadTransform(config, name="transform")
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = keras.layers.Dense(
+            config.vocab_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="decoder",
+            use_bias=False,
+        )
+        self.config = config
+
+    def build(self, input_shape=None):
+        self.bias = self.add_weight(name="bias", shape=(self.config.vocab_size,), initializer="zeros", trainable=True)
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transform", None) is not None:
+            with tf.name_scope(self.transform.name):
+                self.transform.build(None)
+        if getattr(self, "decoder", None) is not None:
+            with tf.name_scope(self.decoder.name):
+                self.decoder.build([None, None, self.config.hidden_size])
+
+    def call(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states) + self.bias
+        return hidden_states
+
+
+class TFBlipTextOnlyMLMHead(keras.layers.Layer):
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+        self.predictions = TFBlipTextLMPredictionHead(config, name="predictions")
+
+    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "predictions", None) is not None:
+            with tf.name_scope(self.predictions.name):
+                self.predictions.build(None)
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L548
+class TFBlipTextPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = BlipTextConfig
+    base_model_prefix = "bert"
+    _keys_to_ignore_on_load_missing = [r"position_ids"]
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/3a29b7410476bf5f2ba0955827390eb6ea1f4f9d/models/med.py#L571
+class TFBlipTextModel(TFBlipTextPreTrainedModel):
+    """
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin. argument and `is_decoder` set to `True`; an
+    `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True, name=None, **kwargs):
+        super().__init__(config, name=name, **kwargs)
+        self.config = config
+
+        self.embeddings = TFBlipTextEmbeddings(config, name="embeddings")
+        self.encoder = TFBlipTextEncoder(config, name="encoder")
+        self.pooler = TFBlipTextPooler(config, name="pooler") if add_pooling_layer else None
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    @tf.function
+    def get_extended_attention_mask(
+        self, attention_mask: tf.Tensor, input_shape: tuple[int], is_decoder: bool
+    ) -> tf.Tensor:
+        """
+        Makes broadcastable attention and causal masks so that future and masked tokens are ignored.
+
+        Arguments:
+            attention_mask (`tf.Tensor`):
+                Mask with ones indicating tokens to attend to, zeros for tokens to ignore.
+            input_shape (`tuple[int]`):
+                The shape of the input to the model.
+            is_decoder (`bool`):
+                Whether the model is used as a decoder.
+
+        Returns:
+            `tf.Tensor` The extended attention mask, with the same dtype as `attention_mask.dtype`.
+        """
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        if not isinstance(attention_mask, tf.Tensor):
+            attention_mask = tf.convert_to_tensor(attention_mask)  # Catches NumPy inputs that haven't been cast yet
+        if attention_mask.shape.rank == 3:
+            extended_attention_mask = attention_mask[:, None, :, :]
+        elif attention_mask.shape.rank == 2:
+            # Provided a padding mask of dimensions [batch_size, seq_length]
+            # - if the model is a decoder, apply a causal mask in addition to the padding mask
+            # - if the model is an encoder, make the mask broadcastable to [batch_size, num_heads, seq_length, seq_length]
+            if is_decoder:
+                batch_size, seq_length = input_shape
+
+                seq_ids = tf.range(seq_length, dtype=attention_mask.dtype)
+                causal_mask = tf.broadcast_to(seq_ids, (batch_size, seq_length, seq_length)) <= seq_ids[None, :, None]
+                # in case past_key_values are used we need to add a prefix ones mask to the causal mask
+
+                if shape_list(causal_mask)[1] < shape_list(attention_mask)[1]:
+                    prefix_seq_len = tf.shape(attention_mask)[1] - tf.shape(causal_mask)[1]
+                    causal_mask = tf.concat(
+                        [
+                            tf.ones((batch_size, seq_length, prefix_seq_len), dtype=causal_mask.dtype),
+                            causal_mask,
+                        ],
+                        axis=-1,
+                    )
+                extended_attention_mask = (
+                    tf.cast(causal_mask[:, None, :, :], attention_mask.dtype) * attention_mask[:, None, None, :]
+                )
+            else:
+                extended_attention_mask = attention_mask[:, None, None, :]
+        else:
+            raise ValueError(
+                f"Wrong shape for input_ids (shape {input_shape}) or attention_mask (shape {attention_mask.shape})"
+            )
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, self.dtype)  # fp16 compatibility
+        extended_attention_mask = (1.0 - extended_attention_mask) * -10000.0
+        return extended_attention_mask
+
+    @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        encoder_embeds: tf.Tensor | None = None,
+        encoder_hidden_states: tf.Tensor | None = None,
+        encoder_attention_mask: tf.Tensor | None = None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        is_decoder: bool = False,
+        training: bool = False,
+    ) -> tuple[tf.Tensor] | TFBaseModelOutputWithPoolingAndCrossAttentions:
+        r"""
+        encoder_hidden_states  (`tf.Tensor`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`tf.Tensor`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(tf.Tensor))`, *optional*):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+            batch_size, seq_length = input_shape
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+            batch_size, seq_length = input_shape
+        elif encoder_embeds is not None:
+            input_shape = shape_list(encoder_embeds)[:-1]
+            batch_size, seq_length = input_shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds or encoder_embeds")
+
+        # past_key_values_length
+        past_key_values_length = past_key_values[0][0].shape[2] if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = tf.ones((batch_size, seq_length + past_key_values_length))
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: tf.Tensor = self.get_extended_attention_mask(attention_mask, input_shape, is_decoder)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if encoder_hidden_states is not None:
+            if isinstance(encoder_hidden_states, list):
+                encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states[0])
+            else:
+                encoder_batch_size, encoder_sequence_length, _ = shape_list(encoder_hidden_states)
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+
+            if isinstance(encoder_attention_mask, list):
+                encoder_extended_attention_mask = [invert_attention_mask(mask) for mask in encoder_attention_mask]
+            elif encoder_attention_mask is None:
+                encoder_attention_mask = tf.ones(encoder_hidden_shape)
+                encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask)
+            else:
+                encoder_extended_attention_mask = invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        if encoder_embeds is None:
+            embedding_output = self.embeddings(
+                input_ids=input_ids,
+                position_ids=position_ids,
+                inputs_embeds=inputs_embeds,
+                past_key_values_length=past_key_values_length,
+            )
+        else:
+            embedding_output = encoder_embeds
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embeddings", None) is not None:
+            with tf.name_scope(self.embeddings.name):
+                self.embeddings.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "pooler", None) is not None:
+            with tf.name_scope(self.pooler.name):
+                self.pooler.build(None)
+
+
+# Adapted from https://github.com/salesforce/BLIP/blob/main/models/med.py#L811
+class TFBlipTextLMHeadModel(TFBlipTextPreTrainedModel):
+    _keys_to_ignore_on_load_unexpected = [r"pooler"]
+    _keys_to_ignore_on_load_missing = [r"position_ids", r"predictions.decoder.bias"]
+
+    def __init__(self, config, **kwargs):
+        super().__init__(config, **kwargs)
+
+        self.bert = TFBlipTextModel(config, add_pooling_layer=False, name="bert")
+        self.cls = TFBlipTextOnlyMLMHead(config, name="cls")
+        self.label_smoothing = config.label_smoothing
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+
+    @add_start_docstrings_to_model_forward(BLIP_TEXT_INPUTS_DOCSTRING)
+    @unpack_inputs
+    def call(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        labels=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        return_logits=False,
+        is_decoder=True,
+        training=None,
+    ):
+        r"""
+        encoder_hidden_states (`tf.Tensor`, *optional*): Sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention if the model is
+            configured as a decoder.
+        encoder_attention_mask (`tf.Tensor`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (`tf.Tensor`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
+        past_key_values (`tuple(tuple(tf.Tensor))`, *optional*):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            is_decoder=is_decoder,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        if return_logits:
+            return prediction_scores[:, :-1, :]
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :-1, :]
+            shifted_prediction_scores = tf.reshape(shifted_prediction_scores, (-1, self.config.vocab_size))
+            labels = labels[:, 1:]
+            labels = tf.reshape(labels, (-1,))
+            # Keras won't give us label smoothing for sparse CE, so we de-sparsify things here
+            # Use relu to clamp masked labels at 0 to avoid NaN (we will be zeroing those out later anyway)
+            one_hot_labels = tf.one_hot(tf.nn.relu(labels), depth=self.config.vocab_size, dtype=tf.float32)
+            loss_fct = keras.losses.CategoricalCrossentropy(
+                from_logits=True, label_smoothing=self.label_smoothing, reduction="none"
+            )
+            masked_positions = tf.cast(tf.not_equal(labels, -100), dtype=tf.float32)
+            lm_loss = loss_fct(one_hot_labels, shifted_prediction_scores)
+            lm_loss *= masked_positions
+            lm_loss = tf.reduce_sum(lm_loss, axis=0) / tf.math.count_nonzero(masked_positions, dtype=tf.float32)
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return TFCausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        # cut decoder_input_ids if past_key_values is used
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "encoder_hidden_states": model_kwargs.get("encoder_hidden_states"),
+            "encoder_attention_mask": model_kwargs.get("encoder_attention_mask"),
+            "is_decoder": True,
+        }
+
+    def _reorder_cache(self, past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (tuple(past_state.index_select(0, beam_idx) for past_state in layer_past),)
+        return reordered_past
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "bert", None) is not None:
+            with tf.name_scope(self.bert.name):
+                self.bert.build(None)
+        if getattr(self, "cls", None) is not None:
+            with tf.name_scope(self.cls.name):
+                self.cls.build(None)
+
+
+__all__ = ["TFBlipTextLMHeadModel", "TFBlipTextModel", "TFBlipTextPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py
new file mode 100644
index 0000000000000000000000000000000000000000..5cc4334a974cb8e47f1b94b180e1f5cf8f553513
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blip/processing_blip.py
@@ -0,0 +1,125 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Blip.
+"""
+
+from typing import Optional, Union
+
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import BatchEncoding, PreTokenizedInput, TextInput
+
+
+class BlipProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "add_special_tokens": True,
+            "padding": False,
+            "stride": 0,
+            "return_overflowing_tokens": False,
+            "return_special_tokens_mask": False,
+            "return_offsets_mapping": False,
+            "return_token_type_ids": False,
+            "return_length": False,
+            "verbose": True,
+        },
+        "images_kwargs": {},
+    }
+
+
+class BlipProcessor(ProcessorMixin):
+    r"""
+    Constructs a BLIP processor which wraps a BERT tokenizer and BLIP image processor into a single processor.
+
+    [`BlipProcessor`] offers all the functionalities of [`BlipImageProcessor`] and [`BertTokenizerFast`]. See the
+    docstring of [`~BlipProcessor.__call__`] and [`~BlipProcessor.decode`] for more information.
+
+    Args:
+        image_processor (`BlipImageProcessor`):
+            An instance of [`BlipImageProcessor`]. The image processor is a required input.
+        tokenizer (`BertTokenizerFast`):
+            An instance of ['BertTokenizerFast`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("BlipImageProcessor", "BlipImageProcessorFast")
+    tokenizer_class = ("BertTokenizer", "BertTokenizerFast")
+
+    def __init__(self, image_processor, tokenizer, **kwargs):
+        tokenizer.return_token_type_ids = False
+        super().__init__(image_processor, tokenizer)
+        self.current_processor = self.image_processor
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Optional[Union[str, list[str], TextInput, PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[BlipProcessorKwargs],
+    ) -> BatchEncoding:
+        """
+        This method uses [`BlipImageProcessor.__call__`] method to prepare image(s) for the model, and
+        [`BertTokenizerFast.__call__`] to prepare text for the model.
+
+        Please refer to the docstring of the above two methods for more information.
+        Args:
+            images (`ImageInput`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`TextInput`, `PreTokenizedInput`, `list[TextInput]`, `list[PreTokenizedInput]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                    - `'tf'`: Return TensorFlow `tf.constant` objects.
+                    - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                    - `'np'`: Return NumPy `np.ndarray` objects.
+                    - `'jax'`: Return JAX `jnp.ndarray` objects.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify either images or text.")
+
+        text_encoding = None
+
+        # add pixel_values encoding. If we also have text_encoding, update image encoding and return it.
+        # else, return the text encoding.
+        output_kwargs = self._merge_kwargs(
+            BlipProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if text is not None:
+            text_encoding = self.tokenizer(text, **output_kwargs["text_kwargs"])
+        if images is not None:
+            encoding_image_processor = self.image_processor(images, **output_kwargs["images_kwargs"])
+
+            if text_encoding is not None:
+                encoding_image_processor.update(text_encoding)
+            return encoding_image_processor
+
+        return text_encoding
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        tokenizer_input_names = [name for name in tokenizer_input_names if name != "token_type_ids"]
+        return tokenizer_input_names + image_processor_input_names
+
+
+__all__ = ["BlipProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..487753e8b5f9ec04ee46a482d14ab68d2e15361e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/configuration_blt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/configuration_blt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da1f18bb95f45f59b3bdf7cf2650e2a843480ca9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/configuration_blt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modeling_blt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modeling_blt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd7176f6a7ef98d9f4c99ae68bc0aef7ca8fbedb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modeling_blt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modular_blt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modular_blt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6fa7be1ac3d188d9e1cc635afb766427265df77
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/blt/__pycache__/modular_blt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ebdb9f103b8ae6cd4cdcf0aca0b7114911ecb583
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9494384cbe71eee86707b5ea58d422fafc5da1b2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/configuration_bridgetower.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7dca811edcb346affef57b976ecf9b74d52f912
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef7f761d85b4bf539f6d4464d6ac25a1278da5d6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/image_processing_bridgetower_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..01bc84d1cfd718d62d49cd8a208c78690542043f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/modeling_bridgetower.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cbc4d01e2b4dcdb93b6499553e21722f794c81b5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/bridgetower/__pycache__/processing_bridgetower.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cb726942b0f16105f8a5a5f7c661485951d7ccc7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .tokenization_byt5 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__pycache__/tokenization_byt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__pycache__/tokenization_byt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cacbdb5774cd99a6d460705a47211970d216bbc9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/__pycache__/tokenization_byt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/tokenization_byt5.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/tokenization_byt5.py
new file mode 100644
index 0000000000000000000000000000000000000000..2a9804db1014a963fb2054083f2db2782a41016d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/byt5/tokenization_byt5.py
@@ -0,0 +1,236 @@
+# coding=utf-8
+# Copyright 2021 T5 Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for model ByT5."""
+
+import warnings
+from typing import Optional
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ByT5Tokenizer(PreTrainedTokenizer):
+    """
+    Construct a ByT5 tokenizer. ByT5 simply uses raw bytes utf-8 encoding.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        extra_ids (`int`, *optional*, defaults to 125):
+            Add a number of extra ids added to the end of the vocabulary for use as sentinels. These tokens are
+            accessible as "<extra_id_{%d}>" where "{%d}" is a number between 0 and extra_ids-1. Extra tokens are
+            indexed from the end of the vocabulary up to beginning ("<extra_id_0>" is the last token in the vocabulary
+            like in ByT5 preprocessing see
+            [here](https://github.com/google-research/text-to-text-transfer-transformer/blob/9fd7b14a769417be33bc6c850f9598764913c833/t5/data/preprocessors.py#L2117)).
+        additional_special_tokens (`list[str]`, *optional*):
+            Additional special tokens used by the tokenizer.
+    """
+
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        eos_token="</s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        extra_ids=125,
+        additional_special_tokens=None,
+        **kwargs,
+    ) -> None:
+        # Add extra_ids to the special token list
+        if extra_ids > 0 and additional_special_tokens is None:
+            additional_special_tokens = [f"<extra_id_{i}>" for i in range(extra_ids)]
+        elif extra_ids > 0 and additional_special_tokens is not None and len(additional_special_tokens) > 0:
+            # Check that we have the right number of extra_id special tokens
+            extra_tokens = len(set(filter(lambda x: bool("extra_id" in str(x)), additional_special_tokens)))
+            if extra_tokens != extra_ids:
+                raise ValueError(
+                    f"Both extra_ids ({extra_ids}) and additional_special_tokens ({additional_special_tokens}) are"
+                    " provided to ByT5Tokenizer. In this case the additional_special_tokens must include the"
+                    " extra_ids tokens"
+                )
+
+        pad_token = AddedToken(pad_token, lstrip=True, rstrip=True) if isinstance(pad_token, str) else pad_token
+        # we force left and right stripping for backward compatibility. The byt5tests depend on this.
+        eos_token = AddedToken(eos_token, lstrip=True, rstrip=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, lstrip=True, rstrip=True) if isinstance(unk_token, str) else unk_token
+        # unk token needs to be in the vocab with correct index
+        self._added_tokens_decoder = {0: pad_token, 1: eos_token, 2: unk_token}
+        self.offset = len(self._added_tokens_decoder)
+        self._utf_vocab_size = 2**8  # utf is 8 bits
+        super().__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            extra_ids=0,
+            additional_special_tokens=additional_special_tokens,  # TODO extra ids are not used :sweatywmile:
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return self._utf_vocab_size
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        # normal case: some special tokens
+        if token_ids_1 is None:
+            return ([0] * len(token_ids_0)) + [1]
+        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
+        """Do not add eos again if user already added it."""
+        if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
+            warnings.warn(
+                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
+                " eos tokens being added."
+            )
+            return token_ids
+        else:
+            return token_ids + [self.eos_token_id]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. ByT5 does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        eos = [self.eos_token_id]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + eos) * [0]
+        return len(token_ids_0 + eos + token_ids_1 + eos) * [0]
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A sequence has the following format:
+
+        - single sequence: `X </s>`
+        - pair of sequences: `A </s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
+        if token_ids_1 is None:
+            return token_ids_0
+        else:
+            token_ids_1 = self._add_eos_if_not_present(token_ids_1)
+            return token_ids_0 + token_ids_1
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        tokens = [chr(i) for i in text.encode("utf-8")]
+        return tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+
+        if len(token) != 1:
+            token_id = None
+        else:
+            token_id = ord(token) + self.offset
+
+        return token_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = chr(index - self.offset)
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        bstring = b""
+        for token in tokens:
+            if token in self.added_tokens_decoder:
+                tok_string = self.added_tokens_decoder[token].encode("utf-8")
+            elif token in self.added_tokens_encoder:
+                tok_string = token.encode("utf-8")
+            else:
+                tok_string = bytes([ord(token)])
+            bstring += tok_string
+        string = bstring.decode("utf-8", errors="ignore")
+        return string
+
+    # ByT5Tokenizer has no vocab file
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        return ()
+
+
+__all__ = ["ByT5Tokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py
new file mode 100644
index 0000000000000000000000000000000000000000..033b8ecd7c630208602e1ae4d38b95df7718aaed
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/chameleon/modeling_chameleon.py
@@ -0,0 +1,1169 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Chameleon model."""
+
+from functools import cached_property
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_chameleon import ChameleonConfig, ChameleonVQVAEConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Chameleon
+class ChameleonRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        ChameleonRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaRotaryEmbedding with Llama->Chameleon
+# TODO(joao): add me back asap :)
+class ChameleonRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        super().__init__()
+        self.scaling_factor = scaling_factor
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (
+            self.base
+            ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        # For BC we register cos and sin cached
+        self.max_seq_len_cached = max_position_embeddings
+
+    @torch.no_grad()
+    def forward(self, x, position_ids):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1)
+        position_ids_expanded = position_ids[:, None, :].float()
+        # Force float32 since bfloat16 loses precision on long contexts
+        # See https://github.com/huggingface/transformers/pull/29285
+        device_type = x.device.type
+        device_type = device_type if device_type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos()
+            sin = emb.sin()
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class ChameleonLinearScalingRotaryEmbedding(ChameleonRotaryEmbedding):
+    """ChameleonRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+    def forward(self, x, position_ids):
+        # difference to the original RoPE: a scaling factor is applied to the position ids
+        position_ids = position_ids.float() / self.scaling_factor
+        cos, sin = super().forward(x, position_ids)
+        return cos, sin
+
+
+class ChameleonDynamicNTKScalingRotaryEmbedding(ChameleonRotaryEmbedding):
+    """ChameleonRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+    def forward(self, x, position_ids):
+        # difference to the original RoPE: inv_freq is recomputed when the sequence length > original length
+        seq_len = torch.max(position_ids) + 1
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (
+                base
+                ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=x.device, dtype=torch.float) / self.dim)
+            )
+            self.register_buffer("inv_freq", inv_freq, persistent=False)  # TODO joao: this may break with compilation
+
+        cos, sin = super().forward(x, position_ids)
+        return cos, sin
+
+
+# Copied from transformers.models.llama.modeling_llama.rotate_half
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+# Copied from transformers.models.llama.modeling_llama.apply_rotary_pos_emb
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaMLP with Llama->Chameleon
+class ChameleonMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=config.mlp_bias)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.mlp_bias)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    # Ignore copy
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class ChameleonLayerNorm(nn.LayerNorm):
+    """
+    LayerNorm but computes stats only over the last dim because Chameleon applies gamma and beta
+    from each shard separately to each head, instead of reducing. We can apply each head's own
+    gamma/beta by repeat-interleaving weights from each shard, but the stats have to be computed
+    in the last dimension. This module applies gamma/beta manually to fulfill this requirement.
+    """
+
+    def __init__(self, hidden_size, *args, **kwargs):
+        super().__init__(hidden_size, *args, **kwargs)
+        self.normalized_shape = (hidden_size[-1],)
+
+    def forward(self, hidden_states):
+        hidden_states = F.layer_norm(hidden_states, self.normalized_shape, None, None, eps=1e-5)
+        hidden_states = hidden_states * self.weight + self.bias
+        return hidden_states
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+# Copied from transformers.models.llama.modeling_llama.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class ChameleonAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: ChameleonConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+        self.model_parallel_size = config.model_parallel_size
+        self.scaling = self.head_dim**-0.5
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.hidden_size, self.hidden_size, bias=config.attention_bias)
+        self.q_norm = ChameleonLayerNorm((self.num_heads, self.head_dim))
+        self.k_norm = ChameleonLayerNorm((self.num_key_value_heads, self.head_dim))
+        self._init_rope()
+
+    # copied from transformers.models.llama.modeling_llama.LlamaAttention._init_rope with Llama->Chameleon
+    # TODO(joao): add me back asap :)
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = ChameleonRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = ChameleonLinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = ChameleonDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.reshape(-1, self.num_heads, self.head_dim)
+        query_states = self.q_norm(query_states)
+
+        key_states = key_states.reshape(-1, self.num_key_value_heads, self.head_dim)
+        key_states = self.k_norm(key_states)
+
+        query_states = query_states.reshape(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.reshape(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = self.rotary_emb(value_states, position_ids)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; position_ids needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+# copied from transformers.models.llama.modeling_llama.LlamaDecoderLayer with Llama->Chameleon, LLAMA->CHAMELEON
+class ChameleonDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: ChameleonConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = ChameleonAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = ChameleonMLP(config)
+        self.input_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence
+            kwargs (`dict`, *optional*):
+                Arbitrary kwargs to be ignored, used for FSDP and other methods that injects code
+                into the model
+        """
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+class ChameleonSwinDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: ChameleonConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = ChameleonAttention(config=config, layer_idx=layer_idx)
+
+        self.mlp = ChameleonMLP(config)
+        self.input_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*):
+                attention mask of size `(batch_size, sequence_length)` if flash attention is used or `(batch_size, 1,
+                query_sequence_length, key_sequence_length)` if default attention is used.
+            position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of positions of each input sequence tokens in the position embeddings
+            past_key_values (`Cache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+        """
+
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = residual + hidden_states
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+class ChameleonVQVAEVectorQuantizer(nn.Module):
+    """
+    A module for vector quantization using learned embedding vectors.
+
+    This module implements the quantization process similar to te one described in
+    the VQ-VAE (Vector Quantized Variational AutoEncoder) paper. It quantizes continuous
+    input vectors into discrete codebook vectors, which are learned during training.
+    Current implementation improves over previous ones by avoiding costly matrix multiplications
+    and allowing for post-hoc remapping of indices.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_embeddings = config.num_embeddings
+        self.embedding_dim = config.embed_dim
+        self.beta = getattr(config, "beta", 0.25)
+
+        self.embedding = nn.Embedding(self.num_embeddings, self.embedding_dim)
+
+    def forward(self, hidden_state: torch.Tensor):
+        hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
+        hidden_state_flattened = hidden_state.view(-1, self.embedding_dim)
+
+        # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z
+        distances = (
+            torch.sum(hidden_state_flattened**2, dim=1, keepdim=True)
+            + torch.sum(self.embedding.weight**2, dim=1)
+            - 2 * torch.einsum("bd,dn->bn", hidden_state_flattened, self.embedding.weight.transpose(0, 1))
+        )
+
+        min_encoding_indices = torch.argmin(distances, dim=1)
+        hidden_state_quant = self.embedding(min_encoding_indices).view(hidden_state.shape)
+
+        # compute loss for embedding
+        loss = torch.mean((hidden_state_quant.detach() - hidden_state) ** 2) + self.beta * torch.mean(
+            (hidden_state_quant - hidden_state.detach()) ** 2
+        )
+
+        # preserve gradients
+        hidden_state_quant = hidden_state + (hidden_state_quant - hidden_state).detach()
+
+        # reshape back to match original input shape
+        hidden_state_quant = hidden_state_quant.permute(0, 3, 1, 2).contiguous()
+
+        return hidden_state_quant, loss, min_encoding_indices
+
+
+class ChameleonVQVAEEncoderConvDownsample(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
+
+    def forward(self, hidden_states):
+        # no asymmetric padding in torch conv, must do it ourselves
+        hidden_states = F.pad(hidden_states, pad=(0, 1, 0, 1), mode="constant", value=0)
+        hidden_states = self.conv(hidden_states)
+        return hidden_states
+
+
+class ChameleonVQVAEEncoderResnetBlock(nn.Module):
+    def __init__(
+        self,
+        config,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+    ):
+        super().__init__()
+        self.in_channels = in_channels
+        self.out_channels = in_channels if out_channels is None else out_channels
+        self.use_conv_shortcut = conv_shortcut
+
+        self.norm1 = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.conv1 = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        self.norm2 = torch.nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
+        self.dropout = torch.nn.Dropout(config.dropout)
+        self.conv2 = torch.nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                self.conv_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+            else:
+                self.nin_shortcut = torch.nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, hidden_states):
+        residual = hidden_states
+        hidden_states = self.norm1(hidden_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.conv1(hidden_states)
+
+        hidden_states = self.norm2(hidden_states)
+        hidden_states *= torch.sigmoid(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+
+        if self.in_channels != self.out_channels:
+            if self.use_conv_shortcut:
+                residual = self.conv_shortcut(residual)
+            else:
+                residual = self.nin_shortcut(residual)
+
+        return residual + hidden_states
+
+
+class ChameleonVQVAEEncoderAttnBlock(nn.Module):
+    def __init__(self, in_channels):
+        super().__init__()
+        self.in_channels = in_channels
+
+        self.norm = torch.nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
+        self.q = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.k = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.v = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+        self.proj_out = torch.nn.Conv2d(in_channels, in_channels, kernel_size=1, stride=1, padding=0)
+
+    def forward(self, hidden_states):
+        residual = hidden_states
+        hidden_states = self.norm(hidden_states)
+        query_states = self.q(hidden_states)
+        key_states = self.k(hidden_states)
+        value_states = self.v(hidden_states)
+
+        # compute attention
+        batch_size, channels, height, width = query_states.shape
+        query_states = query_states.reshape(batch_size, channels, height * width).permute(0, 2, 1)
+        key_states = key_states.reshape(batch_size, channels, height * width)
+        attn_weights = torch.bmm(query_states, key_states)
+        attn_weights = attn_weights * (int(channels) ** (-0.5))
+        attn_weights = F.softmax(attn_weights, dim=2)
+
+        # attend to values
+        value_states = value_states.reshape(batch_size, channels, height * width)
+        attn_weights = attn_weights.permute(0, 2, 1)
+        attn_output = torch.bmm(value_states, attn_weights).reshape(batch_size, channels, height, width)
+
+        attn_output = self.proj_out(attn_output)
+        return residual + attn_output
+
+
+class ChameleonVQVAEEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.num_resolutions = len(config.channel_multiplier)
+        self.num_res_blocks = config.num_res_blocks
+        base_channels = config.base_channels
+        resolution = config.resolution
+        in_channels = config.in_channels
+        double_latent = config.double_latent
+        latent_channels = config.latent_channels
+        channel_multiplier = config.channel_multiplier
+
+        self.conv_in = torch.nn.Conv2d(in_channels, base_channels, kernel_size=3, stride=1, padding=1)
+
+        curr_res = resolution
+        in_channel_multiplier = (1,) + tuple(channel_multiplier)
+        self.in_channel_multiplier = in_channel_multiplier
+        self.down = nn.ModuleList()
+        for i_level in range(self.num_resolutions):
+            block = nn.ModuleList()
+            attn = nn.ModuleList()
+            block_in = base_channels * in_channel_multiplier[i_level]
+            block_out = base_channels * channel_multiplier[i_level]
+            for i_block in range(self.num_res_blocks):
+                block.append(
+                    ChameleonVQVAEEncoderResnetBlock(
+                        config=config,
+                        in_channels=block_in,
+                        out_channels=block_out,
+                    )
+                )
+                block_in = block_out
+                if (
+                    config.attn_resolutions is not None
+                    and curr_res in config.attn_resolutions
+                    and config.attn_type == "vanilla"
+                ):
+                    attn.append(ChameleonVQVAEEncoderAttnBlock(block_in))
+
+            down = nn.Module()
+            down.block = block
+            down.attn = attn
+            if i_level != self.num_resolutions - 1:
+                down.downsample = ChameleonVQVAEEncoderConvDownsample(block_in)
+                curr_res = curr_res // 2
+            self.down.append(down)
+
+        self.mid = nn.Module()
+        self.mid.block_1 = ChameleonVQVAEEncoderResnetBlock(
+            config=config,
+            in_channels=block_in,
+            out_channels=block_in,
+        )
+        self.mid.attn_1 = ChameleonVQVAEEncoderAttnBlock(block_in) if config.attn_type == "vanilla" else nn.Identity()
+        self.mid.block_2 = ChameleonVQVAEEncoderResnetBlock(
+            config=config,
+            in_channels=block_in,
+            out_channels=block_in,
+        )
+
+        self.norm_out = torch.nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
+        self.conv_out = torch.nn.Conv2d(
+            block_in,
+            2 * latent_channels if double_latent else latent_channels,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+        )
+
+    def forward(self, pixel_values: torch.LongTensor):
+        # downsampling
+        hidden_states = [self.conv_in(pixel_values)]
+        for i_level in range(self.num_resolutions):
+            for i_block in range(self.num_res_blocks):
+                hidden_state = self.down[i_level].block[i_block](
+                    hidden_states[-1],
+                )
+                if len(self.down[i_level].attn) > 0:
+                    hidden_state = self.down[i_level].attn[i_block](hidden_state)
+                hidden_states.append(hidden_state)
+            if i_level != self.num_resolutions - 1:
+                hidden_states.append(self.down[i_level].downsample(hidden_states[-1]))
+
+        # middle
+        last_hidden_state = hidden_states[-1]
+        last_hidden_state = self.mid.block_1(last_hidden_state)
+        last_hidden_state = self.mid.attn_1(last_hidden_state)
+        last_hidden_state = self.mid.block_2(last_hidden_state)
+
+        # end
+        last_hidden_state = self.norm_out(last_hidden_state)
+        last_hidden_state *= torch.sigmoid(last_hidden_state)
+        last_hidden_state = self.conv_out(last_hidden_state)
+        return last_hidden_state
+
+
+class ChameleonImageVocabularyMapping:
+    """
+    A class for mapping discrete image tokens from VQGAN to BPE tokens.
+    """
+
+    def __init__(self, vocab_map):
+        self.vocab_map = vocab_map
+        self.image_token_id = vocab_map.get("<image>")
+
+    @cached_property
+    def val2name(self):
+        return {v: k for k, v in self.vocab_map.items()}
+
+    @cached_property
+    def image_tokens(self):
+        return sorted([val for name, val in self.vocab_map.items() if name.startswith("IMGIMG")])
+
+    @cached_property
+    def bpe2img(self):
+        img_tkn_chr_mapping = {chr(ord("A") + i): str(i) for i in range(10)}
+
+        def remap(old_name: str) -> str:
+            return "".join(img_tkn_chr_mapping.get(c, c) for c in old_name[len("IMGIMG") : -1])
+
+        return {tok: int(remap(self.val2name[tok])) for tok in self.image_tokens}
+
+    @cached_property
+    def img2bpe(self):
+        return {v: k for k, v in self.bpe2img.items()}
+
+    @cached_property
+    def bpe2img_search_tensors(self):
+        return torch.tensor(sorted(self.bpe2img.keys())), torch.tensor(sorted(self.bpe2img.values()))
+
+    @cached_property
+    def img2bpe_mapping_tensor(self):
+        mapping = torch.zeros(max(self.img2bpe.keys()) + 1, dtype=torch.int)
+        for k, v in self.img2bpe.items():
+            mapping[k] = v
+        return mapping
+
+    def convert_img2bpe(self, img_batch: torch.Tensor) -> torch.Tensor:
+        device = img_batch.device
+        img_tokens = self.img2bpe_mapping_tensor[img_batch.to("cpu")]
+        return img_tokens.to(device)
+
+
+@auto_docstring
+class ChameleonPreTrainedModel(PreTrainedModel):
+    config: ChameleonConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["ChameleonDecoderLayer", "ChameleonSwinDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_param_buffer_assignment = False
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+
+@auto_docstring(
+    custom_intro="""
+    The VQ-VAE model used in Chameleon for encoding/decoding images into discrete tokens.
+    This model follows the "Make-a-scene: Scene-based text-to-image generation with human priors" paper from
+    [ Oran Gafni, Adam Polyak, Oron Ashual, Shelly Sheynin, Devi Parikh, and Yaniv
+    Taigman](https://huggingface.co/papers/2203.13131).
+    """
+)
+class ChameleonVQVAE(ChameleonPreTrainedModel):
+    config: ChameleonVQVAEConfig
+    _no_split_modules = [
+        "ChameleonVQVAEVectorQuantizer",
+        "ChameleonVQVAEEncoderAttnBlock",
+        "ChameleonVQVAEEncoderResnetBlock",
+    ]
+
+    def __init__(self, config: ChameleonVQVAEConfig):
+        super().__init__(config)
+
+        self.encoder = ChameleonVQVAEEncoder(config)
+        self.quantize = ChameleonVQVAEVectorQuantizer(config)
+        self.quant_conv = torch.nn.Conv2d(config.latent_channels, config.embed_dim, 1)
+        self.post_quant_conv = torch.nn.Conv2d(config.embed_dim, config.latent_channels, 1)
+        self.eval()  # Chameleon's VQ model is frozen
+
+    def encode(self, pixel_values: torch.LongTensor):
+        hidden_states = self.encoder(pixel_values)
+        hidden_states = self.quant_conv(hidden_states)
+        quant, emb_loss, indices = self.quantize(hidden_states)
+        return quant, emb_loss, indices
+
+
+@auto_docstring
+class ChameleonModel(ChameleonPreTrainedModel):
+    def __init__(self, config: ChameleonConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.vocabulary_mapping = ChameleonImageVocabularyMapping(config.vocabulary_map)
+        decoder_layer = ChameleonDecoderLayer if not self.config.swin_norm else ChameleonSwinDecoderLayer
+        self.layers = nn.ModuleList(
+            [decoder_layer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = ChameleonRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.vqmodel = ChameleonVQVAE._from_config(config.vq_config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_image_tokens(self, pixel_values: torch.FloatTensor):
+        """
+        Tokenizes images into discrete tokens with VQGAN module. Converts
+        obtained image tokens into BPE tokens and wraps with "boi" and "eoi"
+        special tokens.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
+                The tensors corresponding to the input images.
+        """
+        batch_size = pixel_values.shape[0]
+        _, _, image_toks = self.vqmodel.encode(pixel_values)
+        bpe_toks = self.vocabulary_mapping.convert_img2bpe(image_toks)
+        bpe_toks = bpe_toks.view(batch_size, -1)
+        return bpe_toks
+
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        """
+        Tokenizes images into discrete tokens with VQGAN module and embeds
+        them with text embeddings layer
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, image_size, image_size)):
+                The tensors corresponding to the input images.
+        """
+        image_tokens = self.get_image_tokens(pixel_values)
+        vision_embeddings = self.get_input_embeddings()(image_tokens)
+        return vision_embeddings
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.vocabulary_mapping.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.vocabulary_mapping.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_embeds
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_embeds)
+
+        # torch.jit.trace() doesn't support cache objects in the output
+        if use_cache and past_key_values is None and not torch.jit.is_tracing():
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        # embed positions
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                **kwargs,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Chameleon Model with a head on top used for outputting logits for next token prediction.
+    """
+)
+class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = ChameleonModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_image_tokens(self, pixel_values):
+        return self.model.get_image_tokens(pixel_values)
+
+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import ChameleonProcessor, ChameleonForConditionalGeneration
+        >>> import torch
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> model = ChameleonForConditionalGeneration.from_pretrained("facebook/chameleon-7b", dtype=torch.bfloat16)
+        >>> processor = ChameleonProcessor.from_pretrained("facebook/chameleon-7b")
+
+        >>> prompt = "I used to know a lot about constellations when I was younger, but as I grew older, I forgot most of what I knew. These are the only two constellations that I really remember now.<image><image>I would like for you to tell me about 3 more constellations and give me a little bit of history about the constellation."
+        >>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw)
+        >>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw)
+
+        >>> inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.bfloat16)
+
+        >>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
+        >>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        logits = self.lm_head(hidden_states)
+
+        # Disallow image tokens which does not include special begin-image and end-image tokens
+        image_tokens = self.model.vocabulary_mapping.image_tokens
+        logits[:, :, image_tokens] = torch.finfo(logits.dtype).min
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        pixel_values=None,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            pixel_values=pixel_values,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            **kwargs,
+        )
+
+        if cache_position[0] != 0:
+            # If we're in cached decoding stage, pixel values should be `None` because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = None
+
+        return model_inputs
+
+
+__all__ = ["ChameleonForConditionalGeneration", "ChameleonModel", "ChameleonPreTrainedModel", "ChameleonVQVAE"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53ea73dd0c14060b758f811234b95b6c484a2269
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/configuration_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/configuration_clap.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab3660bd5ead026f9b64e59553876e133243eb70
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/configuration_clap.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/feature_extraction_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/feature_extraction_clap.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a1a8121c8ad755e05175cd05f6b4def439928c2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/feature_extraction_clap.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97b11a07240e804b7240af0df0f62bf37b507752
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/modeling_clap.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/processing_clap.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/processing_clap.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a3c756c8259fdf8d16362f72c9c65b802219c53
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clap/__pycache__/processing_clap.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9018656c0b8baaa34509b62abe2b204d26bd747d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/configuration_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/configuration_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db643e9620cbc4380f66e0c1728b268c4971dfbe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/configuration_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/feature_extraction_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/feature_extraction_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c39dc4c96b17f50052cedfe257b7987362cf937
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/feature_extraction_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd45b775ad5c444c5ea88899ffc4fce6e3e7a7cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bdf46cb18a085c0f48976aa8bf0a1a5cba958fa3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/image_processing_clip_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..83d5ca52ec303ce8132f5c72c6fed62280d8beee
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_flax_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_flax_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79fa5c2722fb9218861b565f61dcb2580f5441e3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_flax_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_tf_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_tf_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d037fc30d4402d1a77ef8e2ef516d054255e48ef
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/modeling_tf_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/processing_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/processing_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f8b311ef842d560873f7a6367eeb8aaedcac583f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/processing_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4dcc41cce811f9ffd560f4dc2439e38c79e7e869
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ae1b9d1b2e48b998018e77ddc42099ab5fd4ecb5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clip/__pycache__/tokenization_clip_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a96d64532a85a58a4d09c4d83fac10ce2db0dc1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3be80c07d48e13d7308a1e3701e30b4738d1ad2c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/configuration_clipseg.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c388077540c517bdf11bf6acb68803a55c808e8e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/modeling_clipseg.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/processing_clipseg.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/processing_clipseg.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bee811d085c8b0e0859833887b73bd14c79917f9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clipseg/__pycache__/processing_clipseg.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..78be2bc4895f94929da48f07c8329a5e8179864d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/configuration_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/configuration_clvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1461f2cec4b4f07d1de1789b8f632d09b9f867b1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/configuration_clvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/feature_extraction_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/feature_extraction_clvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..52555c678fd89d898b6fdba5bab40190d3e2b201
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/feature_extraction_clvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/modeling_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/modeling_clvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..84c80c68b2da0001190e235152724c2b8033b83d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/modeling_clvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/number_normalizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/number_normalizer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49b524dd8765fb5ec4ffdebd4368c7f9b735d65e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/number_normalizer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/processing_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/processing_clvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0329091cb51053fc9d57ebe3b548999160290f31
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/processing_clvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/tokenization_clvp.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/tokenization_clvp.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbc118fb933c1182fe2bdc3834f42dbb2222e52a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/clvp/__pycache__/tokenization_clvp.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea2d9af11150f556b2cedfb78271f174256e64b0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_codegen import *
+    from .modeling_codegen import *
+    from .tokenization_codegen import *
+    from .tokenization_codegen_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/configuration_codegen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/configuration_codegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..6a9ab842710cdd67911305ce324fe4c68dec173b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/configuration_codegen.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""CodeGen model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import Any, Optional
+
+from ... import PreTrainedTokenizer, TensorType, is_torch_available
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfigWithPast, PatchingSpec
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CodeGenConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`CodeGenModel`]. It is used to instantiate a
+    CodeGen model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the CodeGen
+    [Salesforce/codegen-2B-mono](https://huggingface.co/Salesforce/codegen-2B-mono) architecture. Configuration objects
+    inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the documentation from
+    [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50400):
+            Vocabulary size of the CodeGen model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CodeGenModel`].
+        n_positions (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_ctx (`int`, *optional*, defaults to 2048):
+            This attribute is used in `CodeGenModel.__init__` without any real effect.
+        n_embd (`int`, *optional*, defaults to 4096):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 28):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        rotary_dim (`int`, *optional*, defaults to 64):
+            Number of dimensions in the embedding that Rotary Position Embedding is applied to.
+        n_inner (`int`, *optional*):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"gelu_new"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.0):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 50256):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+
+    Example:
+
+    ```python
+    >>> from transformers import CodeGenConfig, CodeGenModel
+
+    >>> # Initializing a CodeGen 6B configuration
+    >>> configuration = CodeGenConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = CodeGenModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "codegen"
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=50400,
+        n_positions=2048,
+        n_ctx=2048,
+        n_embd=4096,
+        n_layer=28,
+        n_head=16,
+        rotary_dim=64,
+        n_inner=None,
+        activation_function="gelu_new",
+        resid_pdrop=0.0,
+        embd_pdrop=0.0,
+        attn_pdrop=0.0,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        tie_word_embeddings=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_ctx = n_ctx
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.rotary_dim = rotary_dim
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(
+            bos_token_id=bos_token_id, eos_token_id=eos_token_id, tie_word_embeddings=tie_word_embeddings, **kwargs
+        )
+
+
+# Copied from transformers.models.gpt2.configuration_gpt2.GPT2OnnxConfig
+class CodeGenOnnxConfig(OnnxConfigWithPast):
+    def __init__(
+        self,
+        config: PretrainedConfig,
+        task: str = "default",
+        patching_specs: Optional[list[PatchingSpec]] = None,
+        use_past: bool = False,
+    ):
+        super().__init__(config, task=task, patching_specs=patching_specs, use_past=use_past)
+        if not getattr(self._config, "pad_token_id", None):
+            # TODO: how to do that better?
+            self._config.pad_token_id = 0
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        common_inputs = OrderedDict({"input_ids": {0: "batch", 1: "sequence"}})
+        if self.use_past:
+            self.fill_with_past_key_values_(common_inputs, direction="inputs")
+            common_inputs["attention_mask"] = {0: "batch", 1: "past_sequence + sequence"}
+        else:
+            common_inputs["attention_mask"] = {0: "batch", 1: "sequence"}
+
+        return common_inputs
+
+    @property
+    def num_layers(self) -> int:
+        return self._config.n_layer
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self._config.n_head
+
+    def generate_dummy_inputs(
+        self,
+        tokenizer: PreTrainedTokenizer,
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional[TensorType] = None,
+    ) -> Mapping[str, Any]:
+        common_inputs = super(OnnxConfigWithPast, self).generate_dummy_inputs(
+            tokenizer, batch_size=batch_size, seq_length=seq_length, is_pair=is_pair, framework=framework
+        )
+
+        # We need to order the input in the way they appears in the forward()
+        ordered_inputs = OrderedDict({"input_ids": common_inputs["input_ids"]})
+
+        # Need to add the past_keys
+        if self.use_past:
+            if not is_torch_available():
+                raise ValueError("Cannot generate dummy past_keys inputs without PyTorch installed.")
+            else:
+                import torch
+
+                batch, seqlen = common_inputs["input_ids"].shape
+                # Not using the same length for past_key_values
+                past_key_values_length = seqlen + 2
+                past_shape = (
+                    batch,
+                    self.num_attention_heads,
+                    past_key_values_length,
+                    self._config.hidden_size // self.num_attention_heads,
+                )
+                ordered_inputs["past_key_values"] = [
+                    (torch.zeros(past_shape), torch.zeros(past_shape)) for _ in range(self.num_layers)
+                ]
+
+        ordered_inputs["attention_mask"] = common_inputs["attention_mask"]
+        if self.use_past:
+            mask_dtype = ordered_inputs["attention_mask"].dtype
+            ordered_inputs["attention_mask"] = torch.cat(
+                [ordered_inputs["attention_mask"], torch.ones(batch, past_key_values_length, dtype=mask_dtype)], dim=1
+            )
+
+        return ordered_inputs
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 13
+
+
+__all__ = ["CodeGenConfig", "CodeGenOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..887b400b479929c947ed2dfeadc436294033ad8a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/modeling_codegen.py
@@ -0,0 +1,668 @@
+# coding=utf-8
+# Copyright 2022 Salesforce authors, The EleutherAI, and HuggingFace Teams. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CodeGen model."""
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...utils import (
+    auto_docstring,
+    is_torch_flex_attn_available,
+    logging,
+)
+from .configuration_codegen import CodeGenConfig
+
+
+if is_torch_flex_attn_available():
+    from torch.nn.attention.flex_attention import BlockMask
+
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.gptj.modeling_gptj.create_sinusoidal_positions
+def create_sinusoidal_positions(num_pos: int, dim: int) -> torch.Tensor:
+    inv_freq = 1.0 / (10000 ** (torch.arange(0, dim, 2, dtype=torch.int64) / dim))
+    sinusoid_inp = torch.einsum("i , j -> i j", torch.arange(num_pos, dtype=torch.int64).float(), inv_freq).float()
+    return torch.cat((torch.sin(sinusoid_inp), torch.cos(sinusoid_inp)), dim=1)
+
+
+# Copied from transformers.models.gptj.modeling_gptj.rotate_every_two
+def rotate_every_two(x: torch.Tensor) -> torch.Tensor:
+    x1 = x[:, :, :, ::2]
+    x2 = x[:, :, :, 1::2]
+    x = torch.stack((-x2, x1), dim=-1)
+    return x.flatten(-2)  # in einsum notation: rearrange(x, '... d j -> ... (d j)')
+
+
+# Copied from transformers.models.gptj.modeling_gptj.apply_rotary_pos_emb
+def apply_rotary_pos_emb(tensor: torch.Tensor, sin: torch.Tensor, cos: torch.Tensor) -> torch.Tensor:
+    sin = torch.repeat_interleave(sin[:, :, None, :], 2, 3)
+    cos = torch.repeat_interleave(cos[:, :, None, :], 2, 3)
+    return (tensor * cos) + (rotate_every_two(tensor) * sin)
+
+
+class CodeGenAttention(nn.Module):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+
+        max_positions = config.max_position_embeddings
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.embed_dim = config.hidden_size
+        self.num_attention_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_attention_heads
+        if self.head_dim * self.num_attention_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_attention_heads (got `embed_dim`: {self.embed_dim} and"
+                f" `num_attention_heads`: {self.num_attention_heads})."
+            )
+        self.scale_attn = torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32)).to(torch.get_default_dtype())
+        self.qkv_proj = nn.Linear(self.embed_dim, self.embed_dim * 3, bias=False)
+
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim, bias=False)
+        self.rotary_dim = config.rotary_dim
+        pos_embd_dim = self.rotary_dim or self.embed_dim
+        self.embed_positions = create_sinusoidal_positions(max_positions, pos_embd_dim)
+
+    def _split_heads(self, x, n_head, dim_head, mp_num):
+        reshaped = x.reshape(x.shape[:-1] + (n_head // mp_num, dim_head))
+        reshaped = reshaped.reshape(x.shape[:-2] + (-1,) + reshaped.shape[-1:])
+        return reshaped
+
+    def _merge_heads(self, tensor, num_attention_heads, attn_head_size):
+        """
+        Merges attn_head_size dim and num_attn_heads dim into n_ctx
+        """
+        if len(tensor.shape) == 5:
+            tensor = tensor.permute(0, 1, 3, 2, 4).contiguous()
+        elif len(tensor.shape) == 4:
+            tensor = tensor.permute(0, 2, 1, 3).contiguous()
+        else:
+            raise ValueError(f"Input tensor rank should be one of [4, 5], but is: {len(tensor.shape)}")
+        new_shape = tensor.size()[:-2] + (num_attention_heads * attn_head_size,)
+        return tensor.view(new_shape)
+
+    def _attn(
+        self,
+        query,
+        key,
+        value,
+        attention_mask=None,
+        head_mask=None,
+    ):
+        # Keep the attention weights computation in fp32 to avoid overflow issues
+        query = query.to(torch.float32)
+        key = key.to(torch.float32)
+
+        attn_weights = torch.matmul(query, key.transpose(-1, -2))
+
+        if attention_mask is not None:
+            causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+            attn_weights += causal_mask
+
+        attn_weights = attn_weights / self.scale_attn
+        attn_weights = nn.Softmax(dim=-1)(attn_weights)
+        attn_weights = attn_weights.to(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[
+        tuple[torch.Tensor, tuple[torch.Tensor]],
+        Optional[tuple[torch.Tensor, tuple[torch.Tensor], tuple[torch.Tensor, ...]]],
+    ]:
+        qkv = self.qkv_proj(hidden_states)
+        # TODO(enijkamp): factor out number of logical TPU-v4 cores or make forward pass agnostic
+        mp_num = 4
+        qkv_split = qkv.reshape(qkv.shape[:-1] + (mp_num, -1))
+
+        local_dim = self.head_dim * self.num_attention_heads // mp_num
+        query, value, key = torch.split(qkv_split, local_dim, dim=-1)
+        query = self._split_heads(query, self.num_attention_heads, self.head_dim, mp_num=mp_num)
+        key = self._split_heads(key, self.num_attention_heads, self.head_dim, mp_num=mp_num)
+
+        value = self._split_heads(value, self.num_attention_heads, self.head_dim, mp_num=mp_num)
+        value = value.permute(0, 2, 1, 3)
+
+        embed_positions = self.embed_positions
+        if embed_positions.device != position_ids.device:
+            embed_positions = embed_positions.to(position_ids.device)
+            self.embed_positions = embed_positions
+
+        sincos = embed_positions[position_ids]
+        sin, cos = torch.split(sincos, sincos.shape[-1] // 2, dim=-1)
+
+        if self.rotary_dim is not None:
+            k_rot = key[:, :, :, : self.rotary_dim]
+            k_pass = key[:, :, :, self.rotary_dim :]
+
+            q_rot = query[:, :, :, : self.rotary_dim]
+            q_pass = query[:, :, :, self.rotary_dim :]
+
+            k_rot = apply_rotary_pos_emb(k_rot, sin, cos)
+            q_rot = apply_rotary_pos_emb(q_rot, sin, cos)
+
+            key = torch.cat([k_rot, k_pass], dim=-1)
+            query = torch.cat([q_rot, q_pass], dim=-1)
+        else:
+            key = apply_rotary_pos_emb(key, sin, cos)
+            query = apply_rotary_pos_emb(query, sin, cos)
+
+        key = key.permute(0, 2, 1, 3)
+        query = query.permute(0, 2, 1, 3)
+
+        # Note that this cast is quite ugly, but is not implemented before ROPE as k_rot in the original codebase is always in fp32.
+        # Reference: https://github.com/salesforce/CodeGen/blob/f210c3bb1216c975ad858cd4132c0fdeabf4bfc2/codegen1/jaxformer/hf/codegen/modeling_codegen.py#L38
+        if layer_past is not None:
+            cache_kwargs = {
+                "sin": sin,
+                "cos": cos,
+                "partial_rotation_size": self.rotary_dim,
+                "cache_position": cache_position,
+            }
+            key, value = layer_past.update(key.to(hidden_states.dtype), value, self.layer_idx, cache_kwargs)
+
+        # compute self-attention: V x Softmax(QK^T)
+        attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        attn_output = self._merge_heads(attn_output, self.num_attention_heads, self.head_dim)
+        attn_output = self.out_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.gptj.modeling_gptj.GPTJMLP with GPTJ->CodeGen
+class CodeGenMLP(nn.Module):
+    def __init__(self, intermediate_size, config):  # in MLP: intermediate_size= 4 * embed_dim
+        super().__init__()
+        embed_dim = config.n_embd
+
+        self.fc_in = nn.Linear(embed_dim, intermediate_size)
+        self.fc_out = nn.Linear(intermediate_size, embed_dim)
+
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: Optional[torch.FloatTensor]) -> torch.FloatTensor:
+        hidden_states = self.fc_in(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.fc_out(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.gptj.modeling_gptj.GPTJBlock with GPTJ->CodeGen
+class CodeGenBlock(GradientCheckpointingLayer):
+    # Ignore copy
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * config.n_embd
+        self.ln_1 = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+        self.attn = CodeGenAttention(config, layer_idx)
+        self.mlp = CodeGenMLP(inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states: Optional[torch.FloatTensor],
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs, attn_weights = self.attn(
+            hidden_states=hidden_states,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        hidden_states = attn_outputs + feed_forward_hidden_states + residual
+
+        return hidden_states, attn_weights
+
+
+@auto_docstring
+class CodeGenPreTrainedModel(PreTrainedModel):
+    config: CodeGenConfig
+    base_model_prefix = "transformer"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["CodeGenBlock"]
+    _skip_keys_device_placement = "past_key_values"
+
+    _can_compile_fullgraph = True
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear,)):
+            # Slightly different from Mesh Transformer JAX which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class CodeGenModel(CodeGenPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embed_dim = config.n_embd
+        self.vocab_size = config.vocab_size
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([CodeGenBlock(config, layer_idx=i) for i in range(config.n_layer)])
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+        self.rotary_dim = min(config.rotary_dim, config.n_ctx // config.num_attention_heads)
+
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,  # NOOP kwargs, for now
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        seq_length = inputs_embeds.shape[1]
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(past_seen_tokens, past_seen_tokens + seq_length, device=inputs_embeds.device)
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, inputs_embeds, cache_position, past_key_values, output_attentions
+        )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x num_attention_heads x N x N
+        # head_mask has shape n_layer x batch x num_attention_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+        hidden_states = inputs_embeds
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, seq_length)
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+        output_shape = (-1, seq_length, hidden_states.size(-1))
+
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                layer_past=past_key_values,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                head_mask=head_mask[i],
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Union[torch.Tensor, "BlockMask"],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+        output_attentions: bool = False,
+    ):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            return attention_mask
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache and not output_attentions:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+            and not output_attentions
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The CodeGen Model transformer with a language modeling head on top.
+    """
+)
+class CodeGenForCausalLM(CodeGenPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = CodeGenModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.Tensor]]]] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_dim)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = transformer_outputs[0]
+
+        # make sure sampling in fp16 works correctly and
+        # compute loss in fp32 to match with mesh-tf version
+        # https://github.com/EleutherAI/gpt-neo/blob/89ce74164da2fb16179106f54e2269b5da8db333/models/gpt2/gpt2.py#L179
+        lm_logits = self.lm_head(hidden_states).to(torch.float32)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+            # Flatten the tokens
+            loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+            loss = loss.to(hidden_states.dtype)
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = ["CodeGenForCausalLM", "CodeGenModel", "CodeGenPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen.py
new file mode 100644
index 0000000000000000000000000000000000000000..152b1a84fc37d5ed6613072284133565ebc86cbf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen.py
@@ -0,0 +1,390 @@
+# coding=utf-8
+# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for CodeGen"""
+
+import json
+import os
+from functools import lru_cache
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+import regex as re
+
+from ...utils import is_tf_available, is_torch_available, logging, to_py_obj
+
+
+if TYPE_CHECKING:
+    if is_torch_available():
+        import torch
+    if is_tf_available():
+        import tensorflow as tf
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "merges_file": "merges.txt",
+}
+
+
+@lru_cache
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
+    characters the bpe code barfs on.
+
+    The reversible bpe codes work on unicode strings. This means you need a large # of unicode characters in your vocab
+    if you want to avoid UNKs. When you're at something like a 10B token dataset you end up needing around 5K for
+    decent coverage. This is a significant percentage of your normal, say, 32K bpe vocab. To avoid that, we want lookup
+    tables between utf-8 bytes and unicode strings.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word.
+
+    Word is represented as tuple of symbols (symbols being variable-length strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+class CodeGenTokenizer(PreTrainedTokenizer):
+    """
+    Construct a CodeGen tokenizer. Based on byte-level Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import CodeGenTokenizer
+
+    >>> tokenizer = CodeGenTokenizer.from_pretrained("Salesforce/codegen-350M-mono")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer will add a space before each word (even the first one).
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        pad_token (`str`, *optional*):
+            The token used for padding, for example when batching sequences of different lengths.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (CodeGen tokenizer detect beginning of words by the preceding space).
+        add_bos_token (`bool`, *optional*, defaults to `False`):
+            Whether to add a beginning of sequence token at the start of sequences.
+        return_token_type_ids (`bool`, *optional*, defaults to `False`):
+            Whether to return token type IDs.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        errors="replace",
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        pad_token=None,
+        add_prefix_space=False,
+        add_bos_token=False,
+        return_token_type_ids=False,
+        **kwargs,
+    ):
+        bos_token = AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token
+        unk_token = AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token
+        self.add_bos_token = add_bos_token
+        self.return_token_type_ids = return_token_type_ids
+        if self.return_token_type_ids:
+            self.model_input_names.append("token_type_ids")
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+        self.add_prefix_space = add_prefix_space
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+        super().__init__(
+            errors=errors,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            add_prefix_space=add_prefix_space,
+            add_bos_token=add_bos_token,
+            return_token_type_ids=return_token_type_ids,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        if self.add_bos_token:
+            bos_token_ids = [self.bos_token_id]
+        else:
+            bos_token_ids = []
+
+        output = bos_token_ids + token_ids_0
+
+        if token_ids_1 is None:
+            return output
+
+        return output + bos_token_ids + token_ids_1
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        if is_split_into_words or add_prefix_space:
+            text = " " + text
+        return (text, kwargs)
+
+    def decode(
+        self,
+        token_ids: Union[int, list[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        truncate_before_pattern: Optional[list[str]] = None,
+        **kwargs,
+    ) -> str:
+        """
+        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
+        tokens and clean up tokenization spaces.
+
+        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
+
+        Args:
+            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids. Can be obtained using the `__call__` method.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the decoding.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
+            truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
+                A list of regular expression strings that will be used to truncate the returned string. This can be
+                used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
+                of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific decode method.
+
+        Returns:
+            `str`: The decoded sentence.
+        """
+
+        token_ids = to_py_obj(token_ids)
+
+        decoded_text = super()._decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+        if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
+            decoded_text = self.truncate(decoded_text, truncate_before_pattern)
+
+        return decoded_text
+
+    def truncate(self, completion, truncate_before_pattern):
+        def find_re(string, pattern, start_pos):
+            m = pattern.search(string, start_pos)
+            return m.start() if m else -1
+
+        terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
+
+        prints = list(re.finditer("^print", completion, re.MULTILINE))
+
+        if len(prints) > 1:
+            completion = completion[: prints[1].start()]
+
+        defs = list(re.finditer("^def", completion, re.MULTILINE))
+
+        if len(defs) > 1:
+            completion = completion[: defs[1].start()]
+
+        start_pos = 0
+
+        terminals_pos = [
+            pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
+        ]
+
+        if len(terminals_pos) > 0:
+            return completion[: min(terminals_pos)]
+        else:
+            return completion
+
+
+__all__ = ["CodeGenTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7bac0db7de4e7c548ddea0eb2b3c498919c6196a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/codegen/tokenization_codegen_fast.py
@@ -0,0 +1,235 @@
+# coding=utf-8
+# Copyright 2022 The Salesforce authors, The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI GPT."""
+
+import re
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ...utils import is_tf_available, is_torch_available, logging
+
+
+if TYPE_CHECKING:
+    if is_torch_available():
+        import torch
+    if is_tf_available():
+        import tensorflow as tf
+
+
+from ...tokenization_utils_base import BatchEncoding
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from .tokenization_codegen import CodeGenTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class CodeGenTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" CodeGen tokenizer (backed by HuggingFace's *tokenizers* library). Based on byte-level
+    Byte-Pair-Encoding.
+
+    This tokenizer has been trained to treat spaces like parts of the tokens (a bit like sentencepiece) so a word will
+    be encoded differently whether it is at the beginning of the sentence (without space) or not:
+
+    ```python
+    >>> from transformers import CodeGenTokenizerFast
+
+    >>> tokenizer = CodeGenTokenizerFast.from_pretrained("Salesforce/codegen-350M-mono")
+    >>> tokenizer("Hello world")["input_ids"]
+    [15496, 995]
+
+    >>> tokenizer(" Hello world")["input_ids"]
+    [18435, 995]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer, but since
+    the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    When used with `is_split_into_words=True`, this tokenizer needs to be instantiated with `add_prefix_space=True`.
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        merges_file (`str`, *optional*):
+            Path to the merges file.
+        tokenizer_file (`str`, *optional*):
+            Path to [tokenizers](https://github.com/huggingface/tokenizers) file (generally has a .json extension) that
+            contains everything needed to load the tokenizer.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (CodeGen tokenizer detect beginning of words by the preceding space).
+        return_token_type_ids (`bool`, *optional*, defaults to `False`):
+            Whether to return token type IDs.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = CodeGenTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        merges_file=None,
+        tokenizer_file=None,
+        unk_token="<|endoftext|>",
+        bos_token="<|endoftext|>",
+        eos_token="<|endoftext|>",
+        add_prefix_space=False,
+        return_token_type_ids=False,
+        **kwargs,
+    ):
+        self.return_token_type_ids = return_token_type_ids
+        if self.return_token_type_ids:
+            self.model_input_names.append("token_type_ids")
+
+        super().__init__(
+            vocab_file,
+            merges_file,
+            tokenizer_file=tokenizer_file,
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            add_prefix_space=add_prefix_space,
+            return_token_type_ids=return_token_type_ids,
+            **kwargs,
+        )
+
+        if kwargs.pop("add_bos_token", False):
+            model_id = kwargs.pop("name_or_path", "")
+            raise ValueError(
+                "Currently GPT2's fast tokenizer does NOT support adding a BOS token. "
+                "Instead you should use GPT2's slow tokenizer class `CodeGenTokenizer` as follows: \n"
+                f"`CodeGenTokenizer.from_pretrained('{model_id}')`\nor\n"
+                f"`AutoTokenizer.from_pretrained('{model_id}', use_fast=False)`\n"
+                "This issue will be fixed soon, see: https://github.com/huggingface/tokenizers/pull/1005."
+                " so that the fast tokenizer works correctly."
+            )
+
+    def _batch_encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+        assert self.add_prefix_space or not is_split_into_words, (
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._batch_encode_plus(*args, **kwargs)
+
+    def _encode_plus(self, *args, **kwargs) -> BatchEncoding:
+        is_split_into_words = kwargs.get("is_split_into_words", False)
+
+        assert self.add_prefix_space or not is_split_into_words, (
+            f"You need to instantiate {self.__class__.__name__} with add_prefix_space=True "
+            "to use it with pretokenized inputs."
+        )
+
+        return super()._encode_plus(*args, **kwargs)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+    def decode(
+        self,
+        token_ids: Union[int, list[int], "np.ndarray", "torch.Tensor", "tf.Tensor"],
+        skip_special_tokens: bool = False,
+        clean_up_tokenization_spaces: Optional[bool] = None,
+        truncate_before_pattern: Optional[list[str]] = None,
+        **kwargs,
+    ) -> str:
+        """
+        Converts a sequence of ids in a string, using the tokenizer and vocabulary with options to remove special
+        tokens and clean up tokenization spaces.
+
+        Similar to doing `self.convert_tokens_to_string(self.convert_ids_to_tokens(token_ids))`.
+
+        Args:
+            token_ids (`Union[int, List[int], np.ndarray, torch.Tensor, tf.Tensor]`):
+                List of tokenized input ids. Can be obtained using the `__call__` method.
+            skip_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to remove special tokens in the decoding.
+            clean_up_tokenization_spaces (`bool`, *optional*):
+                Whether or not to clean up the tokenization spaces. If `None`, will default to
+                `self.clean_up_tokenization_spaces` (available in the `tokenizer_config`).
+            truncate_before_pattern (`List[str]`, *optional*, defaults to `None`):
+                A list of regular expression strings that will be used to truncate the returned string. This can be
+                used to remove extra pieces of code (e.g. truncate if observing a comment symbol "#" at the beginning
+                of a new line). An example pattern could be `["^#", re.escape("<|endoftext|>"), "^'''", "\n\n\n"]`.
+            kwargs (additional keyword arguments, *optional*):
+                Will be passed to the underlying model specific decode method.
+
+        Returns:
+            `str`: The decoded sentence.
+        """
+
+        decoded_text = super().decode(
+            token_ids=token_ids,
+            skip_special_tokens=skip_special_tokens,
+            clean_up_tokenization_spaces=clean_up_tokenization_spaces,
+            **kwargs,
+        )
+
+        if truncate_before_pattern is not None and len(truncate_before_pattern) > 0:
+            decoded_text = self.truncate(decoded_text, truncate_before_pattern)
+
+        return decoded_text
+
+    def truncate(self, completion, truncate_before_pattern):
+        def find_re(string, pattern, start_pos):
+            m = pattern.search(string, start_pos)
+            return m.start() if m else -1
+
+        terminals = [re.compile(pattern, re.MULTILINE) for pattern in truncate_before_pattern]
+
+        prints = list(re.finditer("^print", completion, re.MULTILINE))
+
+        if len(prints) > 1:
+            completion = completion[: prints[1].start()]
+
+        defs = list(re.finditer("^def", completion, re.MULTILINE))
+
+        if len(defs) > 1:
+            completion = completion[: defs[1].start()]
+
+        start_pos = 0
+
+        terminals_pos = [
+            pos for pos in [find_re(completion, terminal, start_pos) for terminal in terminals] if pos != -1
+        ]
+
+        if len(terminals_pos) > 0:
+            return completion[: min(terminals_pos)]
+        else:
+            return completion
+
+
+__all__ = ["CodeGenTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f68cfedd1c15ebd4b3f19fb1b461f39edc144523
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modeling_cohere2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modeling_cohere2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31a61473ffdb1841669126ccc01a88c0e7328afd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modeling_cohere2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modular_cohere2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modular_cohere2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4016c539f7c6cf5556e923e20b5108931f1898dd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cohere2/__pycache__/modular_cohere2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89bbcf782e83f833c2cff63d7b31135ae0c3cd5b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/configuration_convnextv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/configuration_convnextv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1b2f2beee45764600d85c28ba7de1e79a0c9658
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/configuration_convnextv2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_convnextv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_convnextv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..468e500e3a573ba2e1d94534c1255eda540ccf18
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_convnextv2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_tf_convnextv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_tf_convnextv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23c7cef0bf690248567453eb67e86b9c99a4ace8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/convnextv2/__pycache__/modeling_tf_convnextv2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..31fc0d8ebef4685bcba991f44a465f0106e08637
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f4b6b062611bff4756a5a70288215257ae06ea9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/configuration_csm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21da0752c3d8ab55eaeedc3988247190561c4a4d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/generation_csm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a187bc3d44b0ff684a52fca5c66d1f42a8e4142b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modeling_csm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8f3204d44aa3c10ddd0a15130ecfea23814f945f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/modular_csm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d025117a0b314521ca8cf3500f757718873e1936
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/csm/__pycache__/processing_csm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..ea62163babef934550a87c17577891614b071642
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_ctrl import *
+    from .modeling_ctrl import *
+    from .modeling_tf_ctrl import *
+    from .tokenization_ctrl import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a812f0b55650fab8ab9a4d1b4c6bad4cd9446f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/configuration_ctrl.py
@@ -0,0 +1,116 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Salesforce CTRL configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CTRLConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`CTRLModel`] or a [`TFCTRLModel`]. It is used to
+    instantiate a CTRL model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the
+    [Salesforce/ctrl](https://huggingface.co/Salesforce/ctrl) architecture from SalesForce.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 246534):
+            Vocabulary size of the CTRL model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`CTRLModel`] or [`TFCTRLModel`].
+        n_positions (`int`, *optional*, defaults to 256):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 1280):
+            Dimensionality of the embeddings and hidden states.
+        dff (`int`, *optional*, defaults to 8192):
+            Dimensionality of the inner dimension of the feed forward networks (FFN).
+        n_layer (`int`, *optional*, defaults to 48):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-06):
+            The epsilon to use in the layer normalization layers
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
+
+    Examples:
+
+    ```python
+    >>> from transformers import CTRLConfig, CTRLModel
+
+    >>> # Initializing a CTRL configuration
+    >>> configuration = CTRLConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = CTRLModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "ctrl"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=246534,
+        n_positions=256,
+        n_embd=1280,
+        dff=8192,
+        n_layer=48,
+        n_head=16,
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        layer_norm_epsilon=1e-6,
+        initializer_range=0.02,
+        use_cache=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.dff = dff
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+
+        self.use_cache = use_cache
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["CTRLConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..506bed039b17d4dadfce2755366207f3a2e718f3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_ctrl.py
@@ -0,0 +1,787 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CTRL model."""
+
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    auto_docstring,
+    logging,
+)
+from .configuration_ctrl import CTRLConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def angle_defn(pos, i, d_model_size):
+    angle_rates = 1 / torch.pow(10000, (2 * (i // 2)) / d_model_size)
+    return pos * angle_rates
+
+
+def positional_encoding(position, d_model_size, dtype):
+    # create the sinusoidal pattern for the positional encoding
+    angle_rads = angle_defn(
+        torch.arange(position, dtype=torch.int64).to(dtype).unsqueeze(1),
+        torch.arange(d_model_size, dtype=torch.int64).to(dtype).unsqueeze(0),
+        d_model_size,
+    )
+
+    sines = torch.sin(angle_rads[:, 0::2])
+    cosines = torch.cos(angle_rads[:, 1::2])
+
+    pos_encoding = torch.cat([sines, cosines], dim=-1)
+    return pos_encoding
+
+
+def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
+    # calculate attention
+    matmul_qk = torch.matmul(q, k.permute(0, 1, 3, 2))
+
+    dk = k.shape[-1]
+    scaled_attention_logits = matmul_qk / np.sqrt(dk)
+
+    if mask is not None:
+        nd, ns = scaled_attention_logits.size(-2), scaled_attention_logits.size(-1)
+        scaled_attention_logits += mask[ns - nd : ns, :ns] * -1e4
+
+    if attention_mask is not None:
+        # Apply the attention mask
+        scaled_attention_logits = scaled_attention_logits + attention_mask
+
+    attention_weights = torch.softmax(scaled_attention_logits, dim=-1)
+
+    # Mask heads if we want to
+    if head_mask is not None:
+        attention_weights = attention_weights * head_mask
+
+    output = torch.matmul(attention_weights, v)
+
+    return output, attention_weights
+
+
+class MultiHeadAttention(nn.Module):
+    def __init__(self, d_model_size, num_heads, layer_idx=None):
+        super().__init__()
+        self.num_heads = num_heads
+        self.d_model_size = d_model_size
+        self.layer_idx = layer_idx
+
+        self.depth = int(d_model_size / self.num_heads)
+
+        self.Wq = nn.Linear(d_model_size, d_model_size)
+        self.Wk = nn.Linear(d_model_size, d_model_size)
+        self.Wv = nn.Linear(d_model_size, d_model_size)
+
+        self.dense = nn.Linear(d_model_size, d_model_size)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        attention_head_size = self.d_model_size // self.num_heads
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, attention_head_size, self.pruned_heads)
+
+        # Prune linear layers
+        self.Wq = prune_linear_layer(self.Wq, index)
+        self.Wk = prune_linear_layer(self.Wk, index)
+        self.Wv = prune_linear_layer(self.Wv, index)
+        self.dense = prune_linear_layer(self.dense, index, dim=1)
+
+        # Update hyper params
+        self.num_heads = self.num_heads - len(heads)
+        self.d_model_size = attention_head_size * self.num_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def split_into_heads(self, x, batch_size):
+        x = x.reshape(batch_size, -1, self.num_heads, self.depth)
+        return x.permute([0, 2, 1, 3])
+
+    def forward(
+        self,
+        v,
+        k,
+        q,
+        mask,
+        layer_past=None,
+        attention_mask=None,
+        head_mask=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        batch_size = q.shape[0]
+
+        q = self.Wq(q)
+        k = self.Wk(k)
+        v = self.Wv(v)
+
+        q = self.split_into_heads(q, batch_size)
+        k = self.split_into_heads(k, batch_size)
+        v = self.split_into_heads(v, batch_size)
+
+        if layer_past is not None:
+            k, v = layer_past.update(k, v, self.layer_idx, {"cache_position": cache_position})
+
+        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
+        scaled_attention = output[0].permute([0, 2, 1, 3])
+        attn = output[1]
+        original_size_attention = scaled_attention.reshape(batch_size, -1, self.d_model_size)
+        output = self.dense(original_size_attention)
+        return output, attn
+
+
+def point_wise_feed_forward_network(d_model_size, dff):
+    return nn.Sequential(nn.Linear(d_model_size, dff), nn.ReLU(), nn.Linear(dff, d_model_size))
+
+
+class EncoderLayer(nn.Module):
+    def __init__(self, d_model_size, num_heads, dff, rate=0.1, layer_idx=None):
+        super().__init__()
+
+        self.multi_head_attention = MultiHeadAttention(d_model_size, num_heads, layer_idx=layer_idx)
+        self.ffn = point_wise_feed_forward_network(d_model_size, dff)
+
+        self.layernorm1 = nn.LayerNorm(d_model_size, eps=1e-6)
+        self.layernorm2 = nn.LayerNorm(d_model_size, eps=1e-6)
+
+        self.dropout1 = nn.Dropout(rate)
+        self.dropout2 = nn.Dropout(rate)
+
+    def forward(
+        self,
+        x,
+        mask,
+        layer_past=None,
+        attention_mask=None,
+        head_mask=None,
+        use_cache=False,
+        output_attentions=False,
+        cache_position=None,
+    ):
+        normed = self.layernorm1(x)
+        attn_outputs = self.multi_head_attention(
+            normed,
+            normed,
+            normed,
+            mask,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attn_output = attn_outputs[0]
+        attn_output = self.dropout1(attn_output)
+        out1 = x + attn_output
+
+        out2 = self.layernorm2(out1)
+        ffn_output = self.ffn(out2)
+        ffn_output = self.dropout2(ffn_output)
+        out2 = out1 + ffn_output
+
+        outputs = (out2,) + attn_outputs[1:]
+        return outputs
+
+
+@auto_docstring
+class CTRLPreTrainedModel(PreTrainedModel):
+    config: CTRLConfig
+    base_model_prefix = "transformer"
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class CTRLModel(CTRLPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.d_model_size = config.n_embd
+        self.num_layers = config.n_layer
+
+        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size, torch.float)
+
+        self.w = nn.Embedding(config.vocab_size, config.n_embd)
+
+        self.dropout = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList(
+            [
+                EncoderLayer(config.n_embd, config.n_head, config.dff, config.resid_pdrop, layer_idx=i)
+                for i in range(config.n_layer)
+            ]
+        )
+        self.layernorm = nn.LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.w
+
+    def set_input_embeddings(self, new_embeddings):
+        self.w = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.h[layer].multi_head_attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,  # NOOP kwargs, for now
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, CTRLModel
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+        >>> model = CTRLModel.from_pretrained("Salesforce/ctrl")
+
+        >>> # CTRL was trained with control codes as the first token
+        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
+        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 5, 1280]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `DynamicCache` instead, e.g. "
+                "`past_key_values=DynamicCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = DynamicCache.from_legacy_cache(past_key_values)
+
+        past_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if position_ids is None:
+            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0)
+
+        # Attention mask.
+        if attention_mask is not None:
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
+            attention_mask = attention_mask.view(batch_size, -1)
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask.unsqueeze(1).unsqueeze(2)
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and the dtype's smallest value for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+        # Prepare head mask if needed
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+            token_type_embeds = self.w(token_type_ids)
+            token_type_embeds *= np.sqrt(self.d_model_size)
+        else:
+            token_type_embeds = 0
+
+        if inputs_embeds is None:
+            inputs_embeds = self.w(input_ids)
+        # inputs_embeds = embedded.unsqueeze(0) if len(input_ids.shape)<2 else embedded
+        seq_len = input_shape[-1]
+        mask = torch.triu(torch.ones(seq_len + past_length, seq_len + past_length), 1).to(device)
+
+        inputs_embeds *= np.sqrt(self.d_model_size)
+
+        # `self.pos_encoding` won't be sent to the correct device along the model, so we do it manually.
+        self.pos_encoding = self.pos_encoding.to(device)
+        pos_embeds = self.pos_encoding[position_ids, :]
+
+        hidden_states = inputs_embeds + pos_embeds + token_type_embeds
+
+        hidden_states = self.dropout(hidden_states)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        for i, h in enumerate(self.h):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            outputs = h(
+                hidden_states,
+                mask,
+                layer_past=past_key_values,
+                attention_mask=attention_mask,
+                head_mask=head_mask[i],
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_attentions += (outputs[1],)
+
+        hidden_states = self.layernorm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, past_key_values, all_hidden_states, all_attentions] if v is not None
+            )
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class CTRLLMHeadModel(CTRLPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = CTRLModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size, bias=True)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, CTRLLMHeadModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+        >>> model = CTRLLMHeadModel.from_pretrained("Salesforce/ctrl")
+
+        >>> # CTRL was trained with control codes as the first token
+        >>> inputs = tokenizer("Wikipedia The llama is", return_tensors="pt")
+        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+        >>> sequence_ids = model.generate(inputs["input_ids"])
+        >>> sequences = tokenizer.batch_decode(sequence_ids)
+        >>> sequences
+        ['Wikipedia The llama is a member of the family Bovidae. It is native to the Andes of Peru,']
+
+        >>> outputs = model(**inputs, labels=inputs["input_ids"])
+        >>> round(outputs.loss.item(), 2)
+        9.21
+
+        >>> list(outputs.logits.shape)
+        [1, 5, 246534]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                lm_logits,
+                labels,
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, use_cache=None, **kwargs):
+        # Overwritten -- inputs_embeds not working properly
+
+        # only last tokens for inputs_ids if past is defined in kwargs
+        if past_key_values is not None:
+            past_length = past_key_values.get_seq_length()
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        model_inputs = {"input_ids": input_ids, "past_key_values": past_key_values, "use_cache": use_cache}
+
+        # token_type_ids are computed on CTRLModel.forward()
+        kwargs.pop("token_type_ids", None)
+        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                print(f"Warning: {key} is not a recognized input.")
+                model_inputs[key] = value
+
+        return model_inputs
+
+
+@auto_docstring(
+    custom_intro="""
+    The CTRL Model transformer with a sequence classification head on top (linear layer).
+    [`CTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do. Since it does classification on the last token, it requires to know the position of the last
+    token. If a `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in
+    each row. If no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot
+    guess the padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last
+    value in each row of the batch).
+    """
+)
+class CTRLForSequenceClassification(CTRLPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = CTRLModel(config)
+        self.classifier = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else `past_key_values[0].shape[-2]`
+            (`sequence_length` of input past key value states). Indices of input sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only input IDs that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Example of single-label classification:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl")
+
+        >>> # CTRL was trained with control codes as the first token
+        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
+        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits
+
+        >>> predicted_class_id = logits.argmax().item()
+        >>> model.config.id2label[predicted_class_id]
+        'LABEL_0'
+        ```
+
+        ```python
+        >>> import torch
+
+        >>> torch.manual_seed(42)  # doctest: +IGNORE_RESULT
+        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
+        >>> num_labels = len(model.config.id2label)
+        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)
+
+        >>> labels = torch.tensor(1)
+        >>> loss = model(**inputs, labels=labels).loss
+        >>> round(loss.item(), 2)
+        0.93
+        ```
+
+        Example of multi-label classification:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, CTRLForSequenceClassification
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("Salesforce/ctrl")
+        >>> model = CTRLForSequenceClassification.from_pretrained(
+        ...     "Salesforce/ctrl", problem_type="multi_label_classification"
+        ... )
+
+        >>> # CTRL was trained with control codes as the first token
+        >>> inputs = tokenizer("Opinion My dog is cute", return_tensors="pt")
+        >>> assert inputs["input_ids"][0, 0].item() in tokenizer.control_codes.values()
+
+        >>> with torch.no_grad():
+        ...     logits = model(**inputs).logits
+
+        >>> predicted_class_id = logits.argmax().item()
+        >>> model.config.id2label[predicted_class_id]
+        'LABEL_0'
+        ```
+
+        ```python
+        >>> # To train a model on `num_labels` classes, you can pass `num_labels=num_labels` to `.from_pretrained(...)`
+        >>> num_labels = len(model.config.id2label)
+        >>> model = CTRLForSequenceClassification.from_pretrained("Salesforce/ctrl", num_labels=num_labels)
+
+        >>> num_labels = len(model.config.id2label)
+        >>> labels = torch.nn.functional.one_hot(torch.tensor([predicted_class_id]), num_classes=num_labels).to(
+        ...     torch.float
+        ... )
+        >>> loss = model(**inputs, labels=labels).loss
+        >>> loss.backward()  # doctest: +IGNORE_RESULT
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = transformer_outputs[0]
+        logits = self.classifier(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=pooled_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = ["CTRLForSequenceClassification", "CTRLLMHeadModel", "CTRLModel", "CTRLPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py
new file mode 100644
index 0000000000000000000000000000000000000000..1dce90147bd854b5072410eb6d6e74f73c0a9fd0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ctrl/modeling_tf_ctrl.py
@@ -0,0 +1,920 @@
+# coding=utf-8
+# Copyright 2018 Salesforce and HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 CTRL model."""
+
+from __future__ import annotations
+
+import numpy as np
+import tensorflow as tf
+
+from ...modeling_tf_outputs import TFBaseModelOutputWithPast, TFCausalLMOutputWithPast, TFSequenceClassifierOutput
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_ctrl import CTRLConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "Salesforce/ctrl"
+_CONFIG_FOR_DOC = "CTRLConfig"
+
+
+def angle_defn(pos, i, d_model_size):
+    angle_rates = 1 / np.power(10000, (2 * (i // 2)) / d_model_size)
+    return pos * angle_rates
+
+
+def positional_encoding(position, d_model_size):
+    # create the sinusoidal pattern for the positional encoding
+    angle_rads = angle_defn(np.arange(position)[:, np.newaxis], np.arange(d_model_size)[np.newaxis, :], d_model_size)
+
+    sines = np.sin(angle_rads[:, 0::2])
+    cosines = np.cos(angle_rads[:, 1::2])
+    pos_encoding = tf.convert_to_tensor(np.concatenate([sines, cosines], axis=-1))
+
+    return pos_encoding
+
+
+def scaled_dot_product_attention(q, k, v, mask, attention_mask=None, head_mask=None):
+    # calculate attention
+    matmul_qk = tf.matmul(q, k, transpose_b=True)
+
+    dk = tf.cast(shape_list(k)[-1], dtype=matmul_qk.dtype)
+    scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)
+
+    if mask is not None:
+        scaled_attention_logits += tf.cast(mask * -1e4, dtype=scaled_attention_logits.dtype)
+
+    if attention_mask is not None:
+        # Apply the attention mask
+        attention_mask = tf.cast(attention_mask, dtype=scaled_attention_logits.dtype)
+        scaled_attention_logits = scaled_attention_logits + attention_mask
+
+    attention_weights = stable_softmax(scaled_attention_logits, axis=-1)
+
+    # Mask heads if we want to
+    if head_mask is not None:
+        attention_weights = attention_weights * head_mask
+
+    output = tf.matmul(attention_weights, v)
+
+    return output, attention_weights
+
+
+class TFMultiHeadAttention(keras.layers.Layer):
+    def __init__(self, d_model_size, num_heads, output_attentions=False, **kwargs):
+        super().__init__(**kwargs)
+        self.num_heads = num_heads
+        self.d_model_size = d_model_size
+        self.output_attentions = output_attentions
+
+        self.depth = int(d_model_size / self.num_heads)
+
+        self.Wq = keras.layers.Dense(d_model_size, name="Wq")
+        self.Wk = keras.layers.Dense(d_model_size, name="Wk")
+        self.Wv = keras.layers.Dense(d_model_size, name="Wv")
+
+        self.dense = keras.layers.Dense(d_model_size, name="dense")
+
+    def split_into_heads(self, x, batch_size):
+        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
+        return tf.transpose(x, perm=[0, 2, 1, 3])
+
+    def call(self, v, k, q, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
+        batch_size = shape_list(q)[0]
+
+        q = self.Wq(q)
+        k = self.Wk(k)
+        v = self.Wv(v)
+
+        q = self.split_into_heads(q, batch_size)
+        k = self.split_into_heads(k, batch_size)
+        v = self.split_into_heads(v, batch_size)
+
+        if layer_past is not None:
+            past_key, past_value = tf.unstack(layer_past, axis=0)
+            k = tf.concat((past_key, k), axis=-2)
+            v = tf.concat((past_value, v), axis=-2)
+
+        if use_cache:
+            present = tf.stack((k, v), axis=0)
+        else:
+            present = (None,)
+
+        output = scaled_dot_product_attention(q, k, v, mask, attention_mask, head_mask)
+        scaled_attention = tf.transpose(output[0], perm=[0, 2, 1, 3])
+        attn = output[1]
+        original_size_attention = tf.reshape(scaled_attention, (batch_size, -1, self.d_model_size))
+        output = self.dense(original_size_attention)
+        outputs = (output, present)
+
+        if output_attentions:
+            outputs = outputs + (attn,)
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "Wq", None) is not None:
+            with tf.name_scope(self.Wq.name):
+                self.Wq.build([None, None, self.d_model_size])
+        if getattr(self, "Wk", None) is not None:
+            with tf.name_scope(self.Wk.name):
+                self.Wk.build([None, None, self.d_model_size])
+        if getattr(self, "Wv", None) is not None:
+            with tf.name_scope(self.Wv.name):
+                self.Wv.build([None, None, self.d_model_size])
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.d_model_size])
+
+
+class TFPointWiseFeedForwardLayer(keras.layers.Layer):
+    def __init__(self, d_model_size, dff, **kwargs):
+        super().__init__(**kwargs)
+
+        self.dense_0 = keras.layers.Dense(dff, activation="relu", name="0")
+        self.dense_2 = keras.layers.Dense(d_model_size, name="2")
+        self.d_model_size = d_model_size
+        self.dff = dff
+
+    def call(self, inputs, trainable=False):
+        dense_0_output = self.dense_0(inputs)
+        dense_2_output = self.dense_2(dense_0_output)
+
+        return dense_2_output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense_0", None) is not None:
+            with tf.name_scope(self.dense_0.name):
+                self.dense_0.build([None, None, self.d_model_size])
+        if getattr(self, "dense_2", None) is not None:
+            with tf.name_scope(self.dense_2.name):
+                self.dense_2.build([None, None, self.dff])
+
+
+class TFEncoderLayer(keras.layers.Layer):
+    def __init__(
+        self, d_model_size, num_heads, dff, rate=0.1, layer_norm_epsilon=1e-6, output_attentions=False, **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.output_attentions = output_attentions
+
+        self.multi_head_attention = TFMultiHeadAttention(
+            d_model_size, num_heads, output_attentions=self.output_attentions, name="multi_head_attention"
+        )
+        self.ffn = TFPointWiseFeedForwardLayer(d_model_size, dff, name="ffn")
+
+        self.layernorm1 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm1")
+        self.layernorm2 = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layernorm2")
+
+        self.dropout1 = keras.layers.Dropout(rate)
+        self.dropout2 = keras.layers.Dropout(rate)
+        self.d_model_size = d_model_size
+
+    def call(self, x, mask, layer_past, attention_mask, head_mask, use_cache, output_attentions, training=False):
+        normed = self.layernorm1(x)
+        attn_outputs = self.multi_head_attention(
+            normed,
+            normed,
+            normed,
+            mask,
+            layer_past,
+            attention_mask,
+            head_mask,
+            use_cache,
+            output_attentions,
+            training=training,
+        )
+        attn_output = attn_outputs[0]
+        attn_output = self.dropout1(attn_output, training=training)
+        out1 = x + attn_output
+
+        out2 = self.layernorm2(out1)
+        ffn_output = self.ffn(out2)
+        ffn_output = self.dropout2(ffn_output, training=training)
+        out2 = out1 + ffn_output
+
+        outputs = (out2,) + attn_outputs[1:]
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "multi_head_attention", None) is not None:
+            with tf.name_scope(self.multi_head_attention.name):
+                self.multi_head_attention.build(None)
+        if getattr(self, "ffn", None) is not None:
+            with tf.name_scope(self.ffn.name):
+                self.ffn.build(None)
+        if getattr(self, "layernorm1", None) is not None:
+            with tf.name_scope(self.layernorm1.name):
+                self.layernorm1.build([None, None, self.d_model_size])
+        if getattr(self, "layernorm2", None) is not None:
+            with tf.name_scope(self.layernorm2.name):
+                self.layernorm2.build([None, None, self.d_model_size])
+
+
+@keras_serializable
+class TFCTRLMainLayer(keras.layers.Layer):
+    config_class = CTRLConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.output_hidden_states = config.output_hidden_states
+        self.output_attentions = config.output_attentions
+        self.use_cache = config.use_cache
+        self.return_dict = config.use_return_dict
+
+        self.d_model_size = config.n_embd
+        self.num_layers = config.n_layer
+
+        self.pos_encoding = positional_encoding(config.n_positions, self.d_model_size)
+
+        self.w = keras.layers.Embedding(
+            input_dim=config.vocab_size,
+            output_dim=config.n_embd,
+            embeddings_initializer=get_initializer(config.initializer_range),
+            name="w",
+        )
+
+        self.dropout = keras.layers.Dropout(config.embd_pdrop)
+        self.h = [
+            TFEncoderLayer(
+                config.n_embd,
+                config.n_head,
+                config.dff,
+                config.resid_pdrop,
+                config.layer_norm_epsilon,
+                self.output_attentions,
+                name=f"h_._{i}",
+            )
+            for i in range(config.n_layer)
+        ]
+        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_epsilon, name="layernorm")
+
+    def get_input_embeddings(self):
+        return self.w
+
+    def set_input_embeddings(self, new_embeddings):
+        self.w = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        raise NotImplementedError
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutputWithPast:
+        # If using past key value states, only the last tokens
+        # should be given as an input
+        if past_key_values is not None:
+            if input_ids is not None:
+                input_ids = input_ids[:, -1:]
+            if inputs_embeds is not None:
+                inputs_embeds = inputs_embeds[:, -1:]
+            if token_type_ids is not None:
+                token_type_ids = token_type_ids[:, -1:]
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+            input_ids = tf.reshape(input_ids, [-1, input_shape[-1]])
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if past_key_values is None:
+            past_length = 0
+            past_key_values = [None] * len(self.h)
+        else:
+            past_length = shape_list(past_key_values[0][0])[-2]
+        if position_ids is None:
+            position_ids = tf.expand_dims(tf.range(past_length, input_shape[-1] + past_length, dtype=tf.int32), axis=0)
+            position_ids = tf.tile(position_ids, [input_shape[0], 1])
+
+        # Attention mask.
+        if attention_mask is not None:
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = tf.reshape(attention_mask, (input_shape[0], 1, 1, input_shape[1] + past_length))
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and -10000.0 for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+
+            one_cst = tf.constant(1.0)
+            ten_thousand_cst = tf.constant(-10000.0)
+            attention_mask = tf.cast(attention_mask, dtype=one_cst.dtype)
+            attention_mask = tf.multiply(tf.subtract(one_cst, attention_mask), ten_thousand_cst)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.num_layers
+
+        if token_type_ids is not None:
+            token_type_ids = tf.reshape(token_type_ids, [-1, shape_list(token_type_ids)[-1]])
+            token_type_embeds = self.w(token_type_ids)
+            token_type_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, dtype=token_type_embeds.dtype))
+        else:
+            token_type_embeds = tf.constant(0.0)
+        position_ids = tf.reshape(position_ids, [-1, shape_list(position_ids)[-1]])
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.w.input_dim)
+            inputs_embeds = self.w(input_ids)
+        seq_len = input_shape[-1]
+        mask = 1 - tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
+
+        inputs_embeds *= tf.math.sqrt(tf.cast(self.d_model_size, inputs_embeds.dtype))
+
+        pos_embeds = tf.gather(self.pos_encoding, position_ids)
+        pos_embeds = tf.cast(pos_embeds, dtype=token_type_embeds.dtype)
+        hidden_states = inputs_embeds + pos_embeds + token_type_embeds
+
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        output_shape = input_shape + [shape_list(hidden_states)[-1]]
+        presents = () if use_cache else None
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        for i, (h, layer_past) in enumerate(zip(self.h, past_key_values)):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (tf.reshape(hidden_states, output_shape),)
+            outputs = h(
+                hidden_states,
+                mask,
+                layer_past,
+                attention_mask,
+                head_mask[i],
+                use_cache,
+                output_attentions,
+                training=training,
+            )
+            hidden_states, present = outputs[:2]
+
+            if use_cache:
+                presents = presents + (present,)
+
+            if output_attentions:
+                all_attentions = all_attentions + (outputs[2],)
+
+        hidden_states = self.layernorm(hidden_states)
+        hidden_states = tf.reshape(hidden_states, output_shape)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if output_attentions:
+            # let the number of heads free (-1) so we can extract attention even after head pruning
+            attention_output_shape = input_shape[:-1] + [-1] + shape_list(all_attentions[0])[-2:]
+            all_attentions = tuple(tf.reshape(t, attention_output_shape) for t in all_attentions)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, presents, all_hidden_states, all_attentions] if v is not None)
+
+        return TFBaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "w", None) is not None:
+            with tf.name_scope(self.w.name):
+                self.w.build(None)
+        if getattr(self, "layernorm", None) is not None:
+            with tf.name_scope(self.layernorm.name):
+                self.layernorm.build([None, None, self.config.n_embd])
+        if getattr(self, "h", None) is not None:
+            for layer in self.h:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFCTRLPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = CTRLConfig
+    base_model_prefix = "transformer"
+
+
+CTRL_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`CTRLConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+CTRL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`Numpy array` or `tf.Tensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past` is `None` else `past[0].shape[-2]` (`sequence_length` of
+            input past key value states).
+
+            Indices of input sequence tokens in the vocabulary.
+
+            If `past` is used, only input IDs that do not have their past calculated should be passed as `input_ids`.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        past (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+            `past` output below). Can be used to speed up sequential decoding. The token ids which have their past
+            given to this model should not be passed as input ids as they have already been computed.
+        attention_mask (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, input_ids_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past` key value states are returned and can be used to speed up decoding (see `past`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare CTRL Model transformer outputting raw hidden-states without any specific head on top.",
+    CTRL_START_DOCSTRING,
+)
+class TFCTRLModel(TFCTRLPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFCTRLMainLayer(config, name="transformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFBaseModelOutputWithPast:
+        outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+
+
+class TFCTRLBiasLayer(keras.layers.Layer):
+    """
+    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
+    so all weights have to be registered in a layer.
+    """
+
+    def __init__(self, shape, initializer, trainable, name, **kwargs):
+        super().__init__(name=name, **kwargs)
+        self.shape = shape
+        self.initializer = initializer
+        self.trainable = trainable
+
+    def build(self, input_shape):
+        self.bias = self.add_weight(
+            name="bias", shape=self.shape, initializer=self.initializer, trainable=self.trainable
+        )
+        super().build(input_shape)
+
+    def call(self, x):
+        return x + self.bias
+
+
+@add_start_docstrings(
+    """
+    The CTRL Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """,
+    CTRL_START_DOCSTRING,
+)
+class TFCTRLLMHeadModel(TFCTRLPreTrainedModel, TFCausalLanguageModelingLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFCTRLMainLayer(config, name="transformer")
+        self.bias_layer = TFCTRLBiasLayer(
+            name="lm_head", shape=[1, config.vocab_size], initializer="zeros", trainable=True
+        )
+
+    def get_output_embeddings(self):
+        return self.get_input_embeddings()
+
+    def set_output_embeddings(self, value):
+        self.set_input_embeddings(value)
+
+    def get_bias(self):
+        return {"lm_head.bias": self.bias_layer.bias}
+
+    def set_bias(self, value):
+        # Replaces the existing layers containing bias for correct (de)serialization.
+        vocab_size = value["lm_head.bias"].shape[-1]
+        self.bias_layer = TFCTRLBiasLayer(
+            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=True
+        )
+        self.bias_layer.build(None)
+        self.bias_layer.bias.assign(value["lm_head.bias"])
+
+    # Copied from transformers.models.gpt2.modeling_tf_gpt2.TFGPT2LMHeadModel.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(self, inputs, past_key_values=None, use_cache=None, **kwargs):
+        token_type_ids = kwargs.get("token_type_ids")
+        # only last token for inputs_ids if past is defined in kwargs
+        if past_key_values:
+            inputs = tf.expand_dims(inputs[:, -1], -1)
+            if token_type_ids is not None:
+                token_type_ids = tf.expand_dims(token_type_ids[:, -1], -1)
+
+        position_ids = kwargs.get("position_ids")
+        attention_mask = kwargs.get("attention_mask")
+
+        if attention_mask is not None and position_ids is None:
+            position_ids = tf.math.cumsum(attention_mask, axis=-1, exclusive=True)
+            if past_key_values:
+                position_ids = tf.expand_dims(position_ids[:, -1], -1)
+
+        return {
+            "input_ids": inputs,
+            "attention_mask": attention_mask,
+            "position_ids": position_ids,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+            "token_type_ids": token_type_ids,
+        }
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFCausalLMOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFCausalLMOutputWithPast:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = tf.matmul(hidden_states, self.transformer.w.weights, transpose_b=True)
+        logits = self.bias_layer(logits)
+
+        loss = None
+        if labels is not None:
+            # shift labels to the left and cut last logit token
+            shifted_logits = logits[:, :-1]
+            labels = labels[:, 1:]
+            loss = self.hf_compute_loss(labels, shifted_logits)
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+        if getattr(self, "bias_layer", None) is not None:
+            with tf.name_scope(self.bias_layer.name):
+                self.bias_layer.build(None)
+
+
+@add_start_docstrings(
+    """
+    The CTRL Model transformer with a sequence classification head on top (linear layer).
+
+    [`TFCTRLForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-1, GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    CTRL_START_DOCSTRING,
+)
+class TFCTRLForSequenceClassification(TFCTRLPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.classifier = keras.layers.Dense(
+            config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="classifier",
+            use_bias=False,
+        )
+        self.transformer = TFCTRLMainLayer(config, name="transformer")
+        self.config = config
+
+    def get_output_embeddings(self):
+        # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too.
+        logger.warning(
+            "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
+            "in transformers v4.32."
+        )
+        return self.transformer.w
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(CTRL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        token_type_ids: np.ndarray | tf.Tensor | None = None,
+        position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFSequenceClassifierOutput:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.classifier(hidden_states)
+        logits_shape = shape_list(logits)
+        batch_size = logits_shape[0]
+
+        if self.config.pad_token_id is None:
+            last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)
+        else:
+            if input_ids is not None:
+                token_indices = tf.range(shape_list(input_ids)[-1])
+                non_pad_mask = tf.cast(input_ids != self.config.pad_token_id, token_indices.dtype)
+                last_non_pad_token = tf.reduce_max(token_indices * non_pad_mask, axis=-1)
+            else:
+                last_non_pad_token = tf.fill((batch_size,), value=logits_shape[1] - 1)
+                logger.warning_once(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+        loss = None
+
+        pooled_logits = tf.gather(logits, last_non_pad_token, batch_dims=1, axis=1)
+
+        if labels is not None:
+            if self.config.pad_token_id is None and logits_shape[0] != 1:
+                raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+
+            loss = self.hf_compute_loss(tf.reshape(labels, [-1]), tf.reshape(pooled_logits, [-1, self.num_labels]))
+
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=pooled_logits,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.n_embd])
+        if getattr(self, "transformer", None) is not None:
+            with tf.name_scope(self.transformer.name):
+                self.transformer.build(None)
+
+
+__all__ = ["TFCTRLForSequenceClassification", "TFCTRLLMHeadModel", "TFCTRLModel", "TFCTRLPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..756aded9e6ad2964ffa5add89af2c4af56071e88
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_cvt import *
+    from .modeling_cvt import *
+    from .modeling_tf_cvt import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1493ccb6e1ec2e3ee7b6276c321e046d639707bf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8d18d603511f707943e62c9ee01827891550a06b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/configuration_cvt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fdb38ce24c0e75e6ca4df0b7749b71a18120d003
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_cvt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cf1bb68399f58b5df4bf4f4cecf196b37977158a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/__pycache__/modeling_tf_cvt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/configuration_cvt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/configuration_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..ba3f6b339627d868f79e4c53a022fed117166074
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/configuration_cvt.py
@@ -0,0 +1,146 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""CvT model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class CvtConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`CvtModel`]. It is used to instantiate a CvT model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the CvT
+    [microsoft/cvt-13](https://huggingface.co/microsoft/cvt-13) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        patch_sizes (`list[int]`, *optional*, defaults to `[7, 3, 3]`):
+            The kernel size of each encoder's patch embedding.
+        patch_stride (`list[int]`, *optional*, defaults to `[4, 2, 2]`):
+            The stride size of each encoder's patch embedding.
+        patch_padding (`list[int]`, *optional*, defaults to `[2, 1, 1]`):
+            The padding size of each encoder's patch embedding.
+        embed_dim (`list[int]`, *optional*, defaults to `[64, 192, 384]`):
+            Dimension of each of the encoder blocks.
+        num_heads (`list[int]`, *optional*, defaults to `[1, 3, 6]`):
+            Number of attention heads for each attention layer in each block of the Transformer encoder.
+        depth (`list[int]`, *optional*, defaults to `[1, 2, 10]`):
+            The number of layers in each encoder block.
+        mlp_ratios (`list[float]`, *optional*, defaults to `[4.0, 4.0, 4.0, 4.0]`):
+            Ratio of the size of the hidden layer compared to the size of the input layer of the Mix FFNs in the
+            encoder blocks.
+        attention_drop_rate (`list[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`):
+            The dropout ratio for the attention probabilities.
+        drop_rate (`list[float]`, *optional*, defaults to `[0.0, 0.0, 0.0]`):
+            The dropout ratio for the patch embeddings probabilities.
+        drop_path_rate (`list[float]`, *optional*, defaults to `[0.0, 0.0, 0.1]`):
+            The dropout probability for stochastic depth, used in the blocks of the Transformer encoder.
+        qkv_bias (`list[bool]`, *optional*, defaults to `[True, True, True]`):
+            The bias bool for query, key and value in attentions
+        cls_token (`list[bool]`, *optional*, defaults to `[False, False, True]`):
+            Whether or not to add a classification token to the output of each of the last 3 stages.
+        qkv_projection_method (`list[string]`, *optional*, defaults to ["dw_bn", "dw_bn", "dw_bn"]`):
+            The projection method for query, key and value Default is depth-wise convolutions with batch norm. For
+            Linear projection use "avg".
+        kernel_qkv (`list[int]`, *optional*, defaults to `[3, 3, 3]`):
+            The kernel size for query, key and value in attention layer
+        padding_kv (`list[int]`, *optional*, defaults to `[1, 1, 1]`):
+            The padding size for key and value in attention layer
+        stride_kv (`list[int]`, *optional*, defaults to `[2, 2, 2]`):
+            The stride size for key and value in attention layer
+        padding_q (`list[int]`, *optional*, defaults to `[1, 1, 1]`):
+            The padding size for query in attention layer
+        stride_q (`list[int]`, *optional*, defaults to `[1, 1, 1]`):
+            The stride size for query in attention layer
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-6):
+            The epsilon used by the layer normalization layers.
+
+    Example:
+
+    ```python
+    >>> from transformers import CvtConfig, CvtModel
+
+    >>> # Initializing a Cvt msft/cvt style configuration
+    >>> configuration = CvtConfig()
+
+    >>> # Initializing a model (with random weights) from the msft/cvt style configuration
+    >>> model = CvtModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "cvt"
+
+    def __init__(
+        self,
+        num_channels=3,
+        patch_sizes=[7, 3, 3],
+        patch_stride=[4, 2, 2],
+        patch_padding=[2, 1, 1],
+        embed_dim=[64, 192, 384],
+        num_heads=[1, 3, 6],
+        depth=[1, 2, 10],
+        mlp_ratio=[4.0, 4.0, 4.0],
+        attention_drop_rate=[0.0, 0.0, 0.0],
+        drop_rate=[0.0, 0.0, 0.0],
+        drop_path_rate=[0.0, 0.0, 0.1],
+        qkv_bias=[True, True, True],
+        cls_token=[False, False, True],
+        qkv_projection_method=["dw_bn", "dw_bn", "dw_bn"],
+        kernel_qkv=[3, 3, 3],
+        padding_kv=[1, 1, 1],
+        stride_kv=[2, 2, 2],
+        padding_q=[1, 1, 1],
+        stride_q=[1, 1, 1],
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.num_channels = num_channels
+        self.patch_sizes = patch_sizes
+        self.patch_stride = patch_stride
+        self.patch_padding = patch_padding
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.depth = depth
+        self.mlp_ratio = mlp_ratio
+        self.attention_drop_rate = attention_drop_rate
+        self.drop_rate = drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.qkv_bias = qkv_bias
+        self.cls_token = cls_token
+        self.qkv_projection_method = qkv_projection_method
+        self.kernel_qkv = kernel_qkv
+        self.padding_kv = padding_kv
+        self.stride_kv = stride_kv
+        self.padding_q = padding_q
+        self.stride_q = stride_q
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+
+
+__all__ = ["CvtConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_cvt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..9d935ee84893e05e925fde0ab2fd80397c7144e5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_cvt.py
@@ -0,0 +1,670 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch CvT model."""
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...modeling_outputs import ImageClassifierOutputWithNoAttention, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import auto_docstring, logging
+from .configuration_cvt import CvtConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for model's outputs, with potential hidden states and attentions.
+    """
+)
+class BaseModelOutputWithCLSToken(ModelOutput):
+    r"""
+    cls_token_value (`torch.FloatTensor` of shape `(batch_size, 1, hidden_size)`):
+        Classification token at the output of the last layer of the model.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    cls_token_value: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath
+class CvtDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class CvtEmbeddings(nn.Module):
+    """
+    Construct the CvT embeddings.
+    """
+
+    def __init__(self, patch_size, num_channels, embed_dim, stride, padding, dropout_rate):
+        super().__init__()
+        self.convolution_embeddings = CvtConvEmbeddings(
+            patch_size=patch_size, num_channels=num_channels, embed_dim=embed_dim, stride=stride, padding=padding
+        )
+        self.dropout = nn.Dropout(dropout_rate)
+
+    def forward(self, pixel_values):
+        hidden_state = self.convolution_embeddings(pixel_values)
+        hidden_state = self.dropout(hidden_state)
+        return hidden_state
+
+
+class CvtConvEmbeddings(nn.Module):
+    """
+    Image to Conv Embedding.
+    """
+
+    def __init__(self, patch_size, num_channels, embed_dim, stride, padding):
+        super().__init__()
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        self.patch_size = patch_size
+        self.projection = nn.Conv2d(num_channels, embed_dim, kernel_size=patch_size, stride=stride, padding=padding)
+        self.normalization = nn.LayerNorm(embed_dim)
+
+    def forward(self, pixel_values):
+        pixel_values = self.projection(pixel_values)
+        batch_size, num_channels, height, width = pixel_values.shape
+        hidden_size = height * width
+        # rearrange "b c h w -> b (h w) c"
+        pixel_values = pixel_values.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
+        if self.normalization:
+            pixel_values = self.normalization(pixel_values)
+        # rearrange "b (h w) c" -> b c h w"
+        pixel_values = pixel_values.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+        return pixel_values
+
+
+class CvtSelfAttentionConvProjection(nn.Module):
+    def __init__(self, embed_dim, kernel_size, padding, stride):
+        super().__init__()
+        self.convolution = nn.Conv2d(
+            embed_dim,
+            embed_dim,
+            kernel_size=kernel_size,
+            padding=padding,
+            stride=stride,
+            bias=False,
+            groups=embed_dim,
+        )
+        self.normalization = nn.BatchNorm2d(embed_dim)
+
+    def forward(self, hidden_state):
+        hidden_state = self.convolution(hidden_state)
+        hidden_state = self.normalization(hidden_state)
+        return hidden_state
+
+
+class CvtSelfAttentionLinearProjection(nn.Module):
+    def forward(self, hidden_state):
+        batch_size, num_channels, height, width = hidden_state.shape
+        hidden_size = height * width
+        # rearrange " b c h w -> b (h w) c"
+        hidden_state = hidden_state.view(batch_size, num_channels, hidden_size).permute(0, 2, 1)
+        return hidden_state
+
+
+class CvtSelfAttentionProjection(nn.Module):
+    def __init__(self, embed_dim, kernel_size, padding, stride, projection_method="dw_bn"):
+        super().__init__()
+        if projection_method == "dw_bn":
+            self.convolution_projection = CvtSelfAttentionConvProjection(embed_dim, kernel_size, padding, stride)
+        self.linear_projection = CvtSelfAttentionLinearProjection()
+
+    def forward(self, hidden_state):
+        hidden_state = self.convolution_projection(hidden_state)
+        hidden_state = self.linear_projection(hidden_state)
+        return hidden_state
+
+
+class CvtSelfAttention(nn.Module):
+    def __init__(
+        self,
+        num_heads,
+        embed_dim,
+        kernel_size,
+        padding_q,
+        padding_kv,
+        stride_q,
+        stride_kv,
+        qkv_projection_method,
+        qkv_bias,
+        attention_drop_rate,
+        with_cls_token=True,
+        **kwargs,
+    ):
+        super().__init__()
+        self.scale = embed_dim**-0.5
+        self.with_cls_token = with_cls_token
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+
+        self.convolution_projection_query = CvtSelfAttentionProjection(
+            embed_dim,
+            kernel_size,
+            padding_q,
+            stride_q,
+            projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
+        )
+        self.convolution_projection_key = CvtSelfAttentionProjection(
+            embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
+        )
+        self.convolution_projection_value = CvtSelfAttentionProjection(
+            embed_dim, kernel_size, padding_kv, stride_kv, projection_method=qkv_projection_method
+        )
+
+        self.projection_query = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+        self.projection_key = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+        self.projection_value = nn.Linear(embed_dim, embed_dim, bias=qkv_bias)
+
+        self.dropout = nn.Dropout(attention_drop_rate)
+
+    def rearrange_for_multi_head_attention(self, hidden_state):
+        batch_size, hidden_size, _ = hidden_state.shape
+        head_dim = self.embed_dim // self.num_heads
+        # rearrange 'b t (h d) -> b h t d'
+        return hidden_state.view(batch_size, hidden_size, self.num_heads, head_dim).permute(0, 2, 1, 3)
+
+    def forward(self, hidden_state, height, width):
+        if self.with_cls_token:
+            cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
+        batch_size, hidden_size, num_channels = hidden_state.shape
+        # rearrange "b (h w) c -> b c h w"
+        hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+
+        key = self.convolution_projection_key(hidden_state)
+        query = self.convolution_projection_query(hidden_state)
+        value = self.convolution_projection_value(hidden_state)
+
+        if self.with_cls_token:
+            query = torch.cat((cls_token, query), dim=1)
+            key = torch.cat((cls_token, key), dim=1)
+            value = torch.cat((cls_token, value), dim=1)
+
+        head_dim = self.embed_dim // self.num_heads
+
+        query = self.rearrange_for_multi_head_attention(self.projection_query(query))
+        key = self.rearrange_for_multi_head_attention(self.projection_key(key))
+        value = self.rearrange_for_multi_head_attention(self.projection_value(value))
+
+        attention_score = torch.einsum("bhlk,bhtk->bhlt", [query, key]) * self.scale
+        attention_probs = torch.nn.functional.softmax(attention_score, dim=-1)
+        attention_probs = self.dropout(attention_probs)
+
+        context = torch.einsum("bhlt,bhtv->bhlv", [attention_probs, value])
+        # rearrange"b h t d -> b t (h d)"
+        _, _, hidden_size, _ = context.shape
+        context = context.permute(0, 2, 1, 3).contiguous().view(batch_size, hidden_size, self.num_heads * head_dim)
+        return context
+
+
+class CvtSelfOutput(nn.Module):
+    """
+    The residual connection is defined in CvtLayer instead of here (as is the case with other models), due to the
+    layernorm applied before each block.
+    """
+
+    def __init__(self, embed_dim, drop_rate):
+        super().__init__()
+        self.dense = nn.Linear(embed_dim, embed_dim)
+        self.dropout = nn.Dropout(drop_rate)
+
+    def forward(self, hidden_state, input_tensor):
+        hidden_state = self.dense(hidden_state)
+        hidden_state = self.dropout(hidden_state)
+        return hidden_state
+
+
+class CvtAttention(nn.Module):
+    def __init__(
+        self,
+        num_heads,
+        embed_dim,
+        kernel_size,
+        padding_q,
+        padding_kv,
+        stride_q,
+        stride_kv,
+        qkv_projection_method,
+        qkv_bias,
+        attention_drop_rate,
+        drop_rate,
+        with_cls_token=True,
+    ):
+        super().__init__()
+        self.attention = CvtSelfAttention(
+            num_heads,
+            embed_dim,
+            kernel_size,
+            padding_q,
+            padding_kv,
+            stride_q,
+            stride_kv,
+            qkv_projection_method,
+            qkv_bias,
+            attention_drop_rate,
+            with_cls_token,
+        )
+        self.output = CvtSelfOutput(embed_dim, drop_rate)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_state, height, width):
+        self_output = self.attention(hidden_state, height, width)
+        attention_output = self.output(self_output, hidden_state)
+        return attention_output
+
+
+class CvtIntermediate(nn.Module):
+    def __init__(self, embed_dim, mlp_ratio):
+        super().__init__()
+        self.dense = nn.Linear(embed_dim, int(embed_dim * mlp_ratio))
+        self.activation = nn.GELU()
+
+    def forward(self, hidden_state):
+        hidden_state = self.dense(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        return hidden_state
+
+
+class CvtOutput(nn.Module):
+    def __init__(self, embed_dim, mlp_ratio, drop_rate):
+        super().__init__()
+        self.dense = nn.Linear(int(embed_dim * mlp_ratio), embed_dim)
+        self.dropout = nn.Dropout(drop_rate)
+
+    def forward(self, hidden_state, input_tensor):
+        hidden_state = self.dense(hidden_state)
+        hidden_state = self.dropout(hidden_state)
+        hidden_state = hidden_state + input_tensor
+        return hidden_state
+
+
+class CvtLayer(nn.Module):
+    """
+    CvtLayer composed by attention layers, normalization and multi-layer perceptrons (mlps).
+    """
+
+    def __init__(
+        self,
+        num_heads,
+        embed_dim,
+        kernel_size,
+        padding_q,
+        padding_kv,
+        stride_q,
+        stride_kv,
+        qkv_projection_method,
+        qkv_bias,
+        attention_drop_rate,
+        drop_rate,
+        mlp_ratio,
+        drop_path_rate,
+        with_cls_token=True,
+    ):
+        super().__init__()
+        self.attention = CvtAttention(
+            num_heads,
+            embed_dim,
+            kernel_size,
+            padding_q,
+            padding_kv,
+            stride_q,
+            stride_kv,
+            qkv_projection_method,
+            qkv_bias,
+            attention_drop_rate,
+            drop_rate,
+            with_cls_token,
+        )
+
+        self.intermediate = CvtIntermediate(embed_dim, mlp_ratio)
+        self.output = CvtOutput(embed_dim, mlp_ratio, drop_rate)
+        self.drop_path = CvtDropPath(drop_prob=drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.layernorm_before = nn.LayerNorm(embed_dim)
+        self.layernorm_after = nn.LayerNorm(embed_dim)
+
+    def forward(self, hidden_state, height, width):
+        self_attention_output = self.attention(
+            self.layernorm_before(hidden_state),  # in Cvt, layernorm is applied before self-attention
+            height,
+            width,
+        )
+        attention_output = self_attention_output
+        attention_output = self.drop_path(attention_output)
+
+        # first residual connection
+        hidden_state = attention_output + hidden_state
+
+        # in Cvt, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_state)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_state)
+        layer_output = self.drop_path(layer_output)
+        return layer_output
+
+
+class CvtStage(nn.Module):
+    def __init__(self, config, stage):
+        super().__init__()
+        self.config = config
+        self.stage = stage
+        if self.config.cls_token[self.stage]:
+            self.cls_token = nn.Parameter(torch.randn(1, 1, self.config.embed_dim[-1]))
+
+        self.embedding = CvtEmbeddings(
+            patch_size=config.patch_sizes[self.stage],
+            stride=config.patch_stride[self.stage],
+            num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
+            embed_dim=config.embed_dim[self.stage],
+            padding=config.patch_padding[self.stage],
+            dropout_rate=config.drop_rate[self.stage],
+        )
+
+        drop_path_rates = [
+            x.item() for x in torch.linspace(0, config.drop_path_rate[self.stage], config.depth[stage], device="cpu")
+        ]
+
+        self.layers = nn.Sequential(
+            *[
+                CvtLayer(
+                    num_heads=config.num_heads[self.stage],
+                    embed_dim=config.embed_dim[self.stage],
+                    kernel_size=config.kernel_qkv[self.stage],
+                    padding_q=config.padding_q[self.stage],
+                    padding_kv=config.padding_kv[self.stage],
+                    stride_kv=config.stride_kv[self.stage],
+                    stride_q=config.stride_q[self.stage],
+                    qkv_projection_method=config.qkv_projection_method[self.stage],
+                    qkv_bias=config.qkv_bias[self.stage],
+                    attention_drop_rate=config.attention_drop_rate[self.stage],
+                    drop_rate=config.drop_rate[self.stage],
+                    drop_path_rate=drop_path_rates[self.stage],
+                    mlp_ratio=config.mlp_ratio[self.stage],
+                    with_cls_token=config.cls_token[self.stage],
+                )
+                for _ in range(config.depth[self.stage])
+            ]
+        )
+
+    def forward(self, hidden_state):
+        cls_token = None
+        hidden_state = self.embedding(hidden_state)
+        batch_size, num_channels, height, width = hidden_state.shape
+        # rearrange b c h w -> b (h w) c"
+        hidden_state = hidden_state.view(batch_size, num_channels, height * width).permute(0, 2, 1)
+        if self.config.cls_token[self.stage]:
+            cls_token = self.cls_token.expand(batch_size, -1, -1)
+            hidden_state = torch.cat((cls_token, hidden_state), dim=1)
+
+        for layer in self.layers:
+            layer_outputs = layer(hidden_state, height, width)
+            hidden_state = layer_outputs
+
+        if self.config.cls_token[self.stage]:
+            cls_token, hidden_state = torch.split(hidden_state, [1, height * width], 1)
+        hidden_state = hidden_state.permute(0, 2, 1).view(batch_size, num_channels, height, width)
+        return hidden_state, cls_token
+
+
+class CvtEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.stages = nn.ModuleList([])
+        for stage_idx in range(len(config.depth)):
+            self.stages.append(CvtStage(config, stage_idx))
+
+    def forward(self, pixel_values, output_hidden_states=False, return_dict=True):
+        all_hidden_states = () if output_hidden_states else None
+        hidden_state = pixel_values
+
+        cls_token = None
+        for _, (stage_module) in enumerate(self.stages):
+            hidden_state, cls_token = stage_module(hidden_state)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_state,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
+
+        return BaseModelOutputWithCLSToken(
+            last_hidden_state=hidden_state,
+            cls_token_value=cls_token,
+            hidden_states=all_hidden_states,
+        )
+
+
+@auto_docstring
+class CvtPreTrainedModel(PreTrainedModel):
+    config: CvtConfig
+    base_model_prefix = "cvt"
+    main_input_name = "pixel_values"
+    _no_split_modules = ["CvtLayer"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data = nn.init.trunc_normal_(module.weight.data, mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, CvtStage):
+            if self.config.cls_token[module.stage]:
+                module.cls_token.data = nn.init.trunc_normal_(
+                    module.cls_token.data, mean=0.0, std=self.config.initializer_range
+                )
+
+
+@auto_docstring
+class CvtModel(CvtPreTrainedModel):
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+        self.encoder = CvtEncoder(config)
+        self.post_init()
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithCLSToken]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        encoder_outputs = self.encoder(
+            pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return BaseModelOutputWithCLSToken(
+            last_hidden_state=sequence_output,
+            cls_token_value=encoder_outputs.cls_token_value,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+    the [CLS] token) e.g. for ImageNet.
+    """
+)
+class CvtForImageClassification(CvtPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.cvt = CvtModel(config, add_pooling_layer=False)
+        self.layernorm = nn.LayerNorm(config.embed_dim[-1])
+        # Classifier head
+        self.classifier = (
+            nn.Linear(config.embed_dim[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.cvt(
+            pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        cls_token = outputs[1]
+        if self.config.cls_token[-1]:
+            sequence_output = self.layernorm(cls_token)
+        else:
+            batch_size, num_channels, height, width = sequence_output.shape
+            # rearrange "b c h w -> b (h w) c"
+            sequence_output = sequence_output.view(batch_size, num_channels, height * width).permute(0, 2, 1)
+            sequence_output = self.layernorm(sequence_output)
+
+        sequence_output_mean = sequence_output.mean(dim=1)
+        logits = self.classifier(sequence_output_mean)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.config.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.config.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.config.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+__all__ = ["CvtForImageClassification", "CvtModel", "CvtPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.py
new file mode 100644
index 0000000000000000000000000000000000000000..9239e1918eeca7588e0a978af4ef524380cd3f5f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/cvt/modeling_tf_cvt.py
@@ -0,0 +1,1095 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Cvt model."""
+
+from __future__ import annotations
+
+import collections.abc
+from dataclasses import dataclass
+
+import tensorflow as tf
+
+from ...modeling_tf_outputs import TFImageClassifierOutputWithNoAttention
+from ...modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_cvt import CvtConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "CvtConfig"
+
+
+@dataclass
+class TFBaseModelOutputWithCLSToken(ModelOutput):
+    """
+    Base class for model's outputs.
+
+    Args:
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        cls_token_value (`tf.Tensor` of shape `(batch_size, 1, hidden_size)`):
+            Classification token at the output of the last layer of the model.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
+            the initial embedding outputs.
+    """
+
+    last_hidden_state: tf.Tensor | None = None
+    cls_token_value: tf.Tensor | None = None
+    hidden_states: tuple[tf.Tensor, ...] | None = None
+
+
+class TFCvtDropPath(keras.layers.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    References:
+        (1) github.com:rwightman/pytorch-image-models
+    """
+
+    def __init__(self, drop_prob: float, **kwargs):
+        super().__init__(**kwargs)
+        self.drop_prob = drop_prob
+
+    def call(self, x: tf.Tensor, training=None):
+        if self.drop_prob == 0.0 or not training:
+            return x
+        keep_prob = 1 - self.drop_prob
+        shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+        random_tensor = keep_prob + tf.random.uniform(shape, 0, 1, dtype=self.compute_dtype)
+        random_tensor = tf.floor(random_tensor)
+        return (x / keep_prob) * random_tensor
+
+
+class TFCvtEmbeddings(keras.layers.Layer):
+    """Construct the Convolutional Token Embeddings."""
+
+    def __init__(
+        self,
+        config: CvtConfig,
+        patch_size: int,
+        num_channels: int,
+        embed_dim: int,
+        stride: int,
+        padding: int,
+        dropout_rate: float,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.convolution_embeddings = TFCvtConvEmbeddings(
+            config,
+            patch_size=patch_size,
+            num_channels=num_channels,
+            embed_dim=embed_dim,
+            stride=stride,
+            padding=padding,
+            name="convolution_embeddings",
+        )
+        self.dropout = keras.layers.Dropout(dropout_rate)
+
+    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_state = self.convolution_embeddings(pixel_values)
+        hidden_state = self.dropout(hidden_state, training=training)
+        return hidden_state
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "convolution_embeddings", None) is not None:
+            with tf.name_scope(self.convolution_embeddings.name):
+                self.convolution_embeddings.build(None)
+
+
+class TFCvtConvEmbeddings(keras.layers.Layer):
+    """Image to Convolution Embeddings. This convolutional operation aims to model local spatial contexts."""
+
+    def __init__(
+        self,
+        config: CvtConfig,
+        patch_size: int,
+        num_channels: int,
+        embed_dim: int,
+        stride: int,
+        padding: int,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.padding = keras.layers.ZeroPadding2D(padding=padding)
+        self.patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        self.projection = keras.layers.Conv2D(
+            filters=embed_dim,
+            kernel_size=patch_size,
+            strides=stride,
+            padding="valid",
+            data_format="channels_last",
+            kernel_initializer=get_initializer(config.initializer_range),
+            name="projection",
+        )
+        # Using the same default epsilon as PyTorch
+        self.normalization = keras.layers.LayerNormalization(epsilon=1e-5, name="normalization")
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+
+    def call(self, pixel_values: tf.Tensor) -> tf.Tensor:
+        if isinstance(pixel_values, dict):
+            pixel_values = pixel_values["pixel_values"]
+
+        pixel_values = self.projection(self.padding(pixel_values))
+
+        # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+        batch_size, height, width, num_channels = shape_list(pixel_values)
+        hidden_size = height * width
+        pixel_values = tf.reshape(pixel_values, shape=(batch_size, hidden_size, num_channels))
+        pixel_values = self.normalization(pixel_values)
+
+        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+        pixel_values = tf.reshape(pixel_values, shape=(batch_size, height, width, num_channels))
+        return pixel_values
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "projection", None) is not None:
+            with tf.name_scope(self.projection.name):
+                self.projection.build([None, None, None, self.num_channels])
+        if getattr(self, "normalization", None) is not None:
+            with tf.name_scope(self.normalization.name):
+                self.normalization.build([None, None, self.embed_dim])
+
+
+class TFCvtSelfAttentionConvProjection(keras.layers.Layer):
+    """Convolutional projection layer."""
+
+    def __init__(self, config: CvtConfig, embed_dim: int, kernel_size: int, stride: int, padding: int, **kwargs):
+        super().__init__(**kwargs)
+        self.padding = keras.layers.ZeroPadding2D(padding=padding)
+        self.convolution = keras.layers.Conv2D(
+            filters=embed_dim,
+            kernel_size=kernel_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            padding="valid",
+            strides=stride,
+            use_bias=False,
+            name="convolution",
+            groups=embed_dim,
+        )
+        # Using the same default epsilon as PyTorch, TF uses (1 - pytorch momentum)
+        self.normalization = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="normalization")
+        self.embed_dim = embed_dim
+
+    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_state = self.convolution(self.padding(hidden_state))
+        hidden_state = self.normalization(hidden_state, training=training)
+        return hidden_state
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "convolution", None) is not None:
+            with tf.name_scope(self.convolution.name):
+                self.convolution.build([None, None, None, self.embed_dim])
+        if getattr(self, "normalization", None) is not None:
+            with tf.name_scope(self.normalization.name):
+                self.normalization.build([None, None, None, self.embed_dim])
+
+
+class TFCvtSelfAttentionLinearProjection(keras.layers.Layer):
+    """Linear projection layer used to flatten tokens into 1D."""
+
+    def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
+        # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+        batch_size, height, width, num_channels = shape_list(hidden_state)
+        hidden_size = height * width
+        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
+        return hidden_state
+
+
+class TFCvtSelfAttentionProjection(keras.layers.Layer):
+    """Convolutional Projection for Attention."""
+
+    def __init__(
+        self,
+        config: CvtConfig,
+        embed_dim: int,
+        kernel_size: int,
+        stride: int,
+        padding: int,
+        projection_method: str = "dw_bn",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if projection_method == "dw_bn":
+            self.convolution_projection = TFCvtSelfAttentionConvProjection(
+                config, embed_dim, kernel_size, stride, padding, name="convolution_projection"
+            )
+        self.linear_projection = TFCvtSelfAttentionLinearProjection()
+
+    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_state = self.convolution_projection(hidden_state, training=training)
+        hidden_state = self.linear_projection(hidden_state)
+        return hidden_state
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "convolution_projection", None) is not None:
+            with tf.name_scope(self.convolution_projection.name):
+                self.convolution_projection.build(None)
+
+
+class TFCvtSelfAttention(keras.layers.Layer):
+    """
+    Self-attention layer. A depth-wise separable convolution operation (Convolutional Projection), is applied for
+    query, key, and value embeddings.
+    """
+
+    def __init__(
+        self,
+        config: CvtConfig,
+        num_heads: int,
+        embed_dim: int,
+        kernel_size: int,
+        stride_q: int,
+        stride_kv: int,
+        padding_q: int,
+        padding_kv: int,
+        qkv_projection_method: str,
+        qkv_bias: bool,
+        attention_drop_rate: float,
+        with_cls_token: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.scale = embed_dim**-0.5
+        self.with_cls_token = with_cls_token
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+
+        self.convolution_projection_query = TFCvtSelfAttentionProjection(
+            config,
+            embed_dim,
+            kernel_size,
+            stride_q,
+            padding_q,
+            projection_method="linear" if qkv_projection_method == "avg" else qkv_projection_method,
+            name="convolution_projection_query",
+        )
+        self.convolution_projection_key = TFCvtSelfAttentionProjection(
+            config,
+            embed_dim,
+            kernel_size,
+            stride_kv,
+            padding_kv,
+            projection_method=qkv_projection_method,
+            name="convolution_projection_key",
+        )
+        self.convolution_projection_value = TFCvtSelfAttentionProjection(
+            config,
+            embed_dim,
+            kernel_size,
+            stride_kv,
+            padding_kv,
+            projection_method=qkv_projection_method,
+            name="convolution_projection_value",
+        )
+
+        self.projection_query = keras.layers.Dense(
+            units=embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            use_bias=qkv_bias,
+            bias_initializer="zeros",
+            name="projection_query",
+        )
+        self.projection_key = keras.layers.Dense(
+            units=embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            use_bias=qkv_bias,
+            bias_initializer="zeros",
+            name="projection_key",
+        )
+        self.projection_value = keras.layers.Dense(
+            units=embed_dim,
+            kernel_initializer=get_initializer(config.initializer_range),
+            use_bias=qkv_bias,
+            bias_initializer="zeros",
+            name="projection_value",
+        )
+        self.dropout = keras.layers.Dropout(attention_drop_rate)
+
+    def rearrange_for_multi_head_attention(self, hidden_state: tf.Tensor) -> tf.Tensor:
+        batch_size, hidden_size, _ = shape_list(hidden_state)
+        head_dim = self.embed_dim // self.num_heads
+        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, self.num_heads, head_dim))
+        hidden_state = tf.transpose(hidden_state, perm=(0, 2, 1, 3))
+        return hidden_state
+
+    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
+        if self.with_cls_token:
+            cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
+
+        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+        batch_size, hidden_size, num_channels = shape_list(hidden_state)
+        hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
+
+        key = self.convolution_projection_key(hidden_state, training=training)
+        query = self.convolution_projection_query(hidden_state, training=training)
+        value = self.convolution_projection_value(hidden_state, training=training)
+
+        if self.with_cls_token:
+            query = tf.concat((cls_token, query), axis=1)
+            key = tf.concat((cls_token, key), axis=1)
+            value = tf.concat((cls_token, value), axis=1)
+
+        head_dim = self.embed_dim // self.num_heads
+
+        query = self.rearrange_for_multi_head_attention(self.projection_query(query))
+        key = self.rearrange_for_multi_head_attention(self.projection_key(key))
+        value = self.rearrange_for_multi_head_attention(self.projection_value(value))
+
+        attention_score = tf.matmul(query, key, transpose_b=True) * self.scale
+        attention_probs = stable_softmax(logits=attention_score, axis=-1)
+        attention_probs = self.dropout(attention_probs, training=training)
+
+        context = tf.matmul(attention_probs, value)
+        # "batch_size, num_heads, hidden_size, head_dim -> batch_size, hidden_size, (num_heads*head_dim)"
+        _, _, hidden_size, _ = shape_list(context)
+        context = tf.transpose(context, perm=(0, 2, 1, 3))
+        context = tf.reshape(context, (batch_size, hidden_size, self.num_heads * head_dim))
+        return context
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "convolution_projection_query", None) is not None:
+            with tf.name_scope(self.convolution_projection_query.name):
+                self.convolution_projection_query.build(None)
+        if getattr(self, "convolution_projection_key", None) is not None:
+            with tf.name_scope(self.convolution_projection_key.name):
+                self.convolution_projection_key.build(None)
+        if getattr(self, "convolution_projection_value", None) is not None:
+            with tf.name_scope(self.convolution_projection_value.name):
+                self.convolution_projection_value.build(None)
+        if getattr(self, "projection_query", None) is not None:
+            with tf.name_scope(self.projection_query.name):
+                self.projection_query.build([None, None, self.embed_dim])
+        if getattr(self, "projection_key", None) is not None:
+            with tf.name_scope(self.projection_key.name):
+                self.projection_key.build([None, None, self.embed_dim])
+        if getattr(self, "projection_value", None) is not None:
+            with tf.name_scope(self.projection_value.name):
+                self.projection_value.build([None, None, self.embed_dim])
+
+
+class TFCvtSelfOutput(keras.layers.Layer):
+    """Output of the Attention layer ."""
+
+    def __init__(self, config: CvtConfig, embed_dim: int, drop_rate: float, **kwargs):
+        super().__init__(**kwargs)
+        self.dense = keras.layers.Dense(
+            units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.dropout = keras.layers.Dropout(drop_rate)
+        self.embed_dim = embed_dim
+
+    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_state = self.dense(inputs=hidden_state)
+        hidden_state = self.dropout(inputs=hidden_state, training=training)
+        return hidden_state
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.embed_dim])
+
+
+class TFCvtAttention(keras.layers.Layer):
+    """Attention layer. First chunk of the convolutional transformer block."""
+
+    def __init__(
+        self,
+        config: CvtConfig,
+        num_heads: int,
+        embed_dim: int,
+        kernel_size: int,
+        stride_q: int,
+        stride_kv: int,
+        padding_q: int,
+        padding_kv: int,
+        qkv_projection_method: str,
+        qkv_bias: bool,
+        attention_drop_rate: float,
+        drop_rate: float,
+        with_cls_token: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.attention = TFCvtSelfAttention(
+            config,
+            num_heads,
+            embed_dim,
+            kernel_size,
+            stride_q,
+            stride_kv,
+            padding_q,
+            padding_kv,
+            qkv_projection_method,
+            qkv_bias,
+            attention_drop_rate,
+            with_cls_token,
+            name="attention",
+        )
+        self.dense_output = TFCvtSelfOutput(config, embed_dim, drop_rate, name="output")
+
+    def prune_heads(self, heads):
+        raise NotImplementedError
+
+    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False):
+        self_output = self.attention(hidden_state, height, width, training=training)
+        attention_output = self.dense_output(self_output, training=training)
+        return attention_output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "dense_output", None) is not None:
+            with tf.name_scope(self.dense_output.name):
+                self.dense_output.build(None)
+
+
+class TFCvtIntermediate(keras.layers.Layer):
+    """Intermediate dense layer. Second chunk of the convolutional transformer block."""
+
+    def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, **kwargs):
+        super().__init__(**kwargs)
+        self.dense = keras.layers.Dense(
+            units=int(embed_dim * mlp_ratio),
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="gelu",
+            name="dense",
+        )
+        self.embed_dim = embed_dim
+
+    def call(self, hidden_state: tf.Tensor) -> tf.Tensor:
+        hidden_state = self.dense(hidden_state)
+        return hidden_state
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.embed_dim])
+
+
+class TFCvtOutput(keras.layers.Layer):
+    """
+    Output of the Convolutional Transformer Block (last chunk). It consists of a MLP and a residual connection.
+    """
+
+    def __init__(self, config: CvtConfig, embed_dim: int, mlp_ratio: int, drop_rate: int, **kwargs):
+        super().__init__(**kwargs)
+        self.dense = keras.layers.Dense(
+            units=embed_dim, kernel_initializer=get_initializer(config.initializer_range), name="dense"
+        )
+        self.dropout = keras.layers.Dropout(drop_rate)
+        self.embed_dim = embed_dim
+        self.mlp_ratio = mlp_ratio
+
+    def call(self, hidden_state: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_state = self.dense(inputs=hidden_state)
+        hidden_state = self.dropout(inputs=hidden_state, training=training)
+        hidden_state = hidden_state + input_tensor
+        return hidden_state
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, int(self.embed_dim * self.mlp_ratio)])
+
+
+class TFCvtLayer(keras.layers.Layer):
+    """
+    Convolutional Transformer Block composed by attention layers, normalization and multi-layer perceptrons (mlps). It
+    consists of 3 chunks : an attention layer, an intermediate dense layer and an output layer. This corresponds to the
+    `Block` class in the original implementation.
+    """
+
+    def __init__(
+        self,
+        config: CvtConfig,
+        num_heads: int,
+        embed_dim: int,
+        kernel_size: int,
+        stride_q: int,
+        stride_kv: int,
+        padding_q: int,
+        padding_kv: int,
+        qkv_projection_method: str,
+        qkv_bias: bool,
+        attention_drop_rate: float,
+        drop_rate: float,
+        mlp_ratio: float,
+        drop_path_rate: float,
+        with_cls_token: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.attention = TFCvtAttention(
+            config,
+            num_heads,
+            embed_dim,
+            kernel_size,
+            stride_q,
+            stride_kv,
+            padding_q,
+            padding_kv,
+            qkv_projection_method,
+            qkv_bias,
+            attention_drop_rate,
+            drop_rate,
+            with_cls_token,
+            name="attention",
+        )
+        self.intermediate = TFCvtIntermediate(config, embed_dim, mlp_ratio, name="intermediate")
+        self.dense_output = TFCvtOutput(config, embed_dim, mlp_ratio, drop_rate, name="output")
+        # Using `layers.Activation` instead of `tf.identity` to better control `training` behaviour.
+        self.drop_path = (
+            TFCvtDropPath(drop_path_rate, name="drop_path")
+            if drop_path_rate > 0.0
+            else keras.layers.Activation("linear", name="drop_path")
+        )
+        # Using the same default epsilon as PyTorch
+        self.layernorm_before = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_before")
+        self.layernorm_after = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm_after")
+        self.embed_dim = embed_dim
+
+    def call(self, hidden_state: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
+        # in Cvt, layernorm is applied before self-attention
+        attention_output = self.attention(self.layernorm_before(hidden_state), height, width, training=training)
+        attention_output = self.drop_path(attention_output, training=training)
+
+        # first residual connection
+        hidden_state = attention_output + hidden_state
+
+        # in Cvt, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_state)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.dense_output(layer_output, hidden_state)
+        layer_output = self.drop_path(layer_output, training=training)
+        return layer_output
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "intermediate", None) is not None:
+            with tf.name_scope(self.intermediate.name):
+                self.intermediate.build(None)
+        if getattr(self, "dense_output", None) is not None:
+            with tf.name_scope(self.dense_output.name):
+                self.dense_output.build(None)
+        if getattr(self, "drop_path", None) is not None:
+            with tf.name_scope(self.drop_path.name):
+                self.drop_path.build(None)
+        if getattr(self, "layernorm_before", None) is not None:
+            with tf.name_scope(self.layernorm_before.name):
+                self.layernorm_before.build([None, None, self.embed_dim])
+        if getattr(self, "layernorm_after", None) is not None:
+            with tf.name_scope(self.layernorm_after.name):
+                self.layernorm_after.build([None, None, self.embed_dim])
+
+
+class TFCvtStage(keras.layers.Layer):
+    """
+    Cvt stage (encoder block). Each stage has 2 parts :
+    - (1) A Convolutional Token Embedding layer
+    - (2) A Convolutional Transformer Block (layer).
+    The classification token is added only in the last stage.
+
+    Args:
+        config ([`CvtConfig`]): Model configuration class.
+        stage (`int`): Stage number.
+    """
+
+    def __init__(self, config: CvtConfig, stage: int, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.stage = stage
+        if self.config.cls_token[self.stage]:
+            self.cls_token = self.add_weight(
+                shape=(1, 1, self.config.embed_dim[-1]),
+                initializer=get_initializer(self.config.initializer_range),
+                trainable=True,
+                name="cvt.encoder.stages.2.cls_token",
+            )
+
+        self.embedding = TFCvtEmbeddings(
+            self.config,
+            patch_size=config.patch_sizes[self.stage],
+            num_channels=config.num_channels if self.stage == 0 else config.embed_dim[self.stage - 1],
+            stride=config.patch_stride[self.stage],
+            embed_dim=config.embed_dim[self.stage],
+            padding=config.patch_padding[self.stage],
+            dropout_rate=config.drop_rate[self.stage],
+            name="embedding",
+        )
+
+        drop_path_rates = tf.linspace(0.0, config.drop_path_rate[self.stage], config.depth[stage])
+        drop_path_rates = [x.numpy().item() for x in drop_path_rates]
+        self.layers = [
+            TFCvtLayer(
+                config,
+                num_heads=config.num_heads[self.stage],
+                embed_dim=config.embed_dim[self.stage],
+                kernel_size=config.kernel_qkv[self.stage],
+                stride_q=config.stride_q[self.stage],
+                stride_kv=config.stride_kv[self.stage],
+                padding_q=config.padding_q[self.stage],
+                padding_kv=config.padding_kv[self.stage],
+                qkv_projection_method=config.qkv_projection_method[self.stage],
+                qkv_bias=config.qkv_bias[self.stage],
+                attention_drop_rate=config.attention_drop_rate[self.stage],
+                drop_rate=config.drop_rate[self.stage],
+                mlp_ratio=config.mlp_ratio[self.stage],
+                drop_path_rate=drop_path_rates[self.stage],
+                with_cls_token=config.cls_token[self.stage],
+                name=f"layers.{j}",
+            )
+            for j in range(config.depth[self.stage])
+        ]
+
+    def call(self, hidden_state: tf.Tensor, training: bool = False):
+        cls_token = None
+        hidden_state = self.embedding(hidden_state, training)
+
+        # "batch_size, height, width, num_channels -> batch_size, (height*width), num_channels"
+        batch_size, height, width, num_channels = shape_list(hidden_state)
+        hidden_size = height * width
+        hidden_state = tf.reshape(hidden_state, shape=(batch_size, hidden_size, num_channels))
+
+        if self.config.cls_token[self.stage]:
+            cls_token = tf.repeat(self.cls_token, repeats=batch_size, axis=0)
+            hidden_state = tf.concat((cls_token, hidden_state), axis=1)
+
+        for layer in self.layers:
+            layer_outputs = layer(hidden_state, height, width, training=training)
+            hidden_state = layer_outputs
+
+        if self.config.cls_token[self.stage]:
+            cls_token, hidden_state = tf.split(hidden_state, [1, height * width], 1)
+
+        # "batch_size, (height*width), num_channels -> batch_size, height, width, num_channels"
+        hidden_state = tf.reshape(hidden_state, shape=(batch_size, height, width, num_channels))
+        return hidden_state, cls_token
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embedding", None) is not None:
+            with tf.name_scope(self.embedding.name):
+                self.embedding.build(None)
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFCvtEncoder(keras.layers.Layer):
+    """
+    Convolutional Vision Transformer encoder. CVT has 3 stages of encoder blocks with their respective number of layers
+    (depth) being 1, 2 and 10.
+
+    Args:
+        config ([`CvtConfig`]): Model configuration class.
+    """
+
+    config_class = CvtConfig
+
+    def __init__(self, config: CvtConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.stages = [
+            TFCvtStage(config, stage_idx, name=f"stages.{stage_idx}") for stage_idx in range(len(config.depth))
+        ]
+
+    def call(
+        self,
+        pixel_values: TFModelInputType,
+        output_hidden_states: bool | None = False,
+        return_dict: bool | None = True,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]:
+        all_hidden_states = () if output_hidden_states else None
+        hidden_state = pixel_values
+        # When running on CPU, `keras.layers.Conv2D` doesn't support (batch_size, num_channels, height, width)
+        # as input format. So change the input format to (batch_size, height, width, num_channels).
+        hidden_state = tf.transpose(hidden_state, perm=(0, 2, 3, 1))
+
+        cls_token = None
+        for _, (stage_module) in enumerate(self.stages):
+            hidden_state, cls_token = stage_module(hidden_state, training=training)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_state,)
+
+        # Change back to (batch_size, num_channels, height, width) format to have uniformity in the modules
+        hidden_state = tf.transpose(hidden_state, perm=(0, 3, 1, 2))
+        if output_hidden_states:
+            all_hidden_states = tuple(tf.transpose(hs, perm=(0, 3, 1, 2)) for hs in all_hidden_states)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, cls_token, all_hidden_states] if v is not None)
+
+        return TFBaseModelOutputWithCLSToken(
+            last_hidden_state=hidden_state,
+            cls_token_value=cls_token,
+            hidden_states=all_hidden_states,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "stages", None) is not None:
+            for layer in self.stages:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFCvtMainLayer(keras.layers.Layer):
+    """Construct the Cvt model."""
+
+    config_class = CvtConfig
+
+    def __init__(self, config: CvtConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.encoder = TFCvtEncoder(config, name="encoder")
+
+    @unpack_inputs
+    def call(
+        self,
+        pixel_values: TFModelInputType | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]:
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        encoder_outputs = self.encoder(
+            pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithCLSToken(
+            last_hidden_state=sequence_output,
+            cls_token_value=encoder_outputs.cls_token_value,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+
+
+class TFCvtPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = CvtConfig
+    base_model_prefix = "cvt"
+    main_input_name = "pixel_values"
+
+
+TFCVT_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TF 2.0 models accepts two formats as inputs:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional arguments.
+
+    This second option is useful when using [`keras.Model.fit`] method which currently requires having all the
+    tensors in the first argument of the model call function: `model(inputs)`.
+
+    </Tip>
+
+    Args:
+        config ([`CvtConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TFCVT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`CvtImageProcessor.__call__`]
+            for details.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False``):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare Cvt Model transformer outputting raw hidden-states without any specific head on top.",
+    TFCVT_START_DOCSTRING,
+)
+class TFCvtModel(TFCvtPreTrainedModel):
+    def __init__(self, config: CvtConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.cvt = TFCvtMainLayer(config, name="cvt")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFBaseModelOutputWithCLSToken, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: tf.Tensor | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFBaseModelOutputWithCLSToken | tuple[tf.Tensor]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, TFCvtModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
+        >>> model = TFCvtModel.from_pretrained("microsoft/cvt-13")
+
+        >>> inputs = image_processor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        outputs = self.cvt(
+            pixel_values=pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if not return_dict:
+            return (outputs[0],) + outputs[1:]
+
+        return TFBaseModelOutputWithCLSToken(
+            last_hidden_state=outputs.last_hidden_state,
+            cls_token_value=outputs.cls_token_value,
+            hidden_states=outputs.hidden_states,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "cvt", None) is not None:
+            with tf.name_scope(self.cvt.name):
+                self.cvt.build(None)
+
+
+@add_start_docstrings(
+    """
+    Cvt Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+    the [CLS] token) e.g. for ImageNet.
+    """,
+    TFCVT_START_DOCSTRING,
+)
+class TFCvtForImageClassification(TFCvtPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: CvtConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+        self.cvt = TFCvtMainLayer(config, name="cvt")
+        # Using same default epsilon as in the original implementation.
+        self.layernorm = keras.layers.LayerNormalization(epsilon=1e-5, name="layernorm")
+
+        # Classifier head
+        self.classifier = keras.layers.Dense(
+            units=config.num_labels,
+            kernel_initializer=get_initializer(config.initializer_range),
+            use_bias=True,
+            bias_initializer="zeros",
+            name="classifier",
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TFCVT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFImageClassifierOutputWithNoAttention, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: tf.Tensor | None = None,
+        labels: tf.Tensor | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ) -> TFImageClassifierOutputWithNoAttention | tuple[tf.Tensor]:
+        r"""
+        labels (`tf.Tensor` or `np.ndarray` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, TFCvtForImageClassification
+        >>> import tensorflow as tf
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("microsoft/cvt-13")
+        >>> model = TFCvtForImageClassification.from_pretrained("microsoft/cvt-13")
+
+        >>> inputs = image_processor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        >>> # model predicts one of the 1000 ImageNet classes
+        >>> predicted_class_idx = tf.math.argmax(logits, axis=-1)[0]
+        >>> print("Predicted class:", model.config.id2label[int(predicted_class_idx)])
+        ```"""
+
+        outputs = self.cvt(
+            pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+        cls_token = outputs[1]
+        if self.config.cls_token[-1]:
+            sequence_output = self.layernorm(cls_token)
+        else:
+            # rearrange "batch_size, num_channels, height, width -> batch_size, (height*width), num_channels"
+            batch_size, num_channels, height, width = shape_list(sequence_output)
+            sequence_output = tf.reshape(sequence_output, shape=(batch_size, num_channels, height * width))
+            sequence_output = tf.transpose(sequence_output, perm=(0, 2, 1))
+            sequence_output = self.layernorm(sequence_output)
+
+        sequence_output_mean = tf.reduce_mean(sequence_output, axis=1)
+        logits = self.classifier(sequence_output_mean)
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "cvt", None) is not None:
+            with tf.name_scope(self.cvt.name):
+                self.cvt.build(None)
+        if getattr(self, "layernorm", None) is not None:
+            with tf.name_scope(self.layernorm.name):
+                self.layernorm.build([None, None, self.config.embed_dim[-1]])
+        if getattr(self, "classifier", None) is not None:
+            if hasattr(self.classifier, "name"):
+                with tf.name_scope(self.classifier.name):
+                    self.classifier.build([None, None, self.config.embed_dim[-1]])
+
+
+__all__ = ["TFCvtForImageClassification", "TFCvtModel", "TFCvtPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..40f84ccb59be91ad440ba413366dd726e555ca26
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_dac import *
+    from .feature_extraction_dac import *
+    from .modeling_dac import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..bdf07cef45bde6a6064d4b6b02eb69f4f077e018
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/configuration_dac.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dac model configuration"""
+
+import math
+
+import numpy as np
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DacConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`DacModel`]. It is used to instantiate a
+    Dac model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [descript/dac_16khz](https://huggingface.co/descript/dac_16khz) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        encoder_hidden_size (`int`, *optional*, defaults to 64):
+            Intermediate representation dimension for the encoder.
+        downsampling_ratios (`list[int]`, *optional*, defaults to `[2, 4, 8, 8]`):
+            Ratios for downsampling in the encoder. These are used in reverse order for upsampling in the decoder.
+        decoder_hidden_size (`int`, *optional*, defaults to 1536):
+            Intermediate representation dimension for the decoder.
+        n_codebooks (`int`, *optional*, defaults to 9):
+            Number of codebooks in the VQVAE.
+        codebook_size (`int`, *optional*, defaults to 1024):
+            Number of discrete codes in each codebook.
+        codebook_dim (`int`, *optional*, defaults to 8):
+            Dimension of the codebook vectors. If not defined, uses `encoder_hidden_size`.
+        quantizer_dropout (`bool`, *optional*, defaults to 0):
+            Whether to apply dropout to the quantizer.
+        commitment_loss_weight (float, *optional*, defaults to 0.25):
+            Weight of the commitment loss term in the VQVAE loss function.
+        codebook_loss_weight (float, *optional*, defaults to 1.0):
+            Weight of the codebook loss term in the VQVAE loss function.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the audio waveform should be digitalized expressed in hertz (Hz).
+    Example:
+
+    ```python
+    >>> from transformers import DacModel, DacConfig
+
+    >>> # Initializing a "descript/dac_16khz" style configuration
+    >>> configuration = DacConfig()
+
+    >>> # Initializing a model (with random weights) from the "descript/dac_16khz" style configuration
+    >>> model = DacModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dac"
+
+    def __init__(
+        self,
+        encoder_hidden_size=64,
+        downsampling_ratios=[2, 4, 8, 8],
+        decoder_hidden_size=1536,
+        n_codebooks=9,
+        codebook_size=1024,
+        codebook_dim=8,
+        quantizer_dropout=0,
+        commitment_loss_weight=0.25,
+        codebook_loss_weight=1.0,
+        sampling_rate=16000,
+        **kwargs,
+    ):
+        self.encoder_hidden_size = encoder_hidden_size
+        self.downsampling_ratios = downsampling_ratios
+        self.decoder_hidden_size = decoder_hidden_size
+        self.upsampling_ratios = downsampling_ratios[::-1]
+        self.n_codebooks = n_codebooks
+        self.codebook_size = codebook_size
+        self.codebook_dim = codebook_dim
+        self.quantizer_dropout = quantizer_dropout
+        self.sampling_rate = sampling_rate
+
+        self.hidden_size = encoder_hidden_size * (2 ** len(downsampling_ratios))
+
+        self.hop_length = int(np.prod(downsampling_ratios))
+        self.commitment_loss_weight = commitment_loss_weight
+        self.codebook_loss_weight = codebook_loss_weight
+
+        super().__init__(**kwargs)
+
+    @property
+    def frame_rate(self) -> int:
+        hop_length = np.prod(self.upsampling_ratios)
+        return math.ceil(self.sampling_rate / hop_length)
+
+
+__all__ = ["DacConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..e81a2466dd91faa91ad9a34590825aa55e9497db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/feature_extraction_dac.py
@@ -0,0 +1,174 @@
+# coding=utf-8
+# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for DAC"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DacFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs an Dac feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used for padding.
+        hop_length (`int`, *optional*, defaults to 512):
+            Overlap length between successive windows.
+    """
+
+    model_input_names = ["input_values", "n_quantizers"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 16000,
+        padding_value: float = 0.0,
+        hop_length: int = 512,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.hop_length = hop_length
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
+                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
+                (`feature_size = 2`).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, *optional*, defaults to `False`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        # verify inputs are valid
+        for idx, example in enumerate(raw_audio):
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2:
+                raise ValueError("Stereo audio isn't supported for now")
+
+        input_values = BatchFeature({"input_values": raw_audio})
+
+        # normal padding on batch
+        padded_inputs = self.pad(
+            input_values,
+            max_length=max_length,
+            truncation=truncation,
+            padding=padding,
+            return_attention_mask=padding,
+            pad_to_multiple_of=self.hop_length,
+        )
+        if padding:
+            padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+        if padding:
+            padded_inputs.input_values = padded_inputs.input_values[:, np.newaxis, :]
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["DacFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py
new file mode 100644
index 0000000000000000000000000000000000000000..e97c8183651e1fa45d0c0886e258aa564b701239
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dac/modeling_dac.py
@@ -0,0 +1,685 @@
+# coding=utf-8
+# Copyright 2024 Descript and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformers DAC model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+
+from ...modeling_utils import PreTrainedAudioTokenizerBase
+from ...utils import ModelOutput, auto_docstring
+from .configuration_dac import DacConfig
+
+
+@dataclass
+@auto_docstring
+class DacOutput(ModelOutput):
+    r"""
+    loss (`torch.Tensor`):
+        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
+    audio_values (`torch.Tensor` of shape `(batch_size, input_length)`):
+        Reconstructed audio data.
+    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+        Quantized continuous representation of input.
+    audio_codes (`torch.LongTensor` of shape `(batch_size, num_codebooks, time_steps)`):
+        Codebook indices for each codebook (quantized discrete representation of input).
+    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
+        Projected latents (continuous representation of input before quantization).
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    audio_values: Optional[torch.FloatTensor] = None
+    quantized_representation: Optional[torch.FloatTensor] = None
+    audio_codes: Optional[torch.LongTensor] = None
+    projected_latents: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring
+class DacEncoderOutput(ModelOutput):
+    r"""
+    loss (`torch.Tensor`):
+        Loss from the encoder model, comprising the weighted combination of the commitment and codebook losses.
+    quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`, *optional*):
+        Quantized continuous representation of input.
+    audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
+        Codebook indices for each codebook (quantized discrete representation of input).
+    projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`, *optional*):
+        Projected latents (continuous representation of input before quantization).
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    quantized_representation: Optional[torch.FloatTensor] = None
+    audio_codes: Optional[torch.FloatTensor] = None
+    projected_latents: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring
+# Copied from transformers.models.encodec.modeling_encodec.EncodecDecoderOutput with Encodec->Dac, segment_length->input_length
+class DacDecoderOutput(ModelOutput):
+    r"""
+    audio_values (`torch.FloatTensor`  of shape `(batch_size, input_length)`, *optional*):
+        Decoded audio values, obtained using the decoder part of Dac.
+    """
+
+    audio_values: Optional[torch.FloatTensor] = None
+
+
+class Snake1d(nn.Module):
+    """
+    A 1-dimensional Snake activation function module.
+    """
+
+    def __init__(self, hidden_dim):
+        super().__init__()
+        self.alpha = nn.Parameter(torch.ones(1, hidden_dim, 1))
+
+    def forward(self, hidden_states):
+        shape = hidden_states.shape
+        hidden_states = hidden_states.reshape(shape[0], shape[1], -1)
+        hidden_states = hidden_states + (self.alpha + 1e-9).reciprocal() * torch.sin(self.alpha * hidden_states).pow(2)
+        hidden_states = hidden_states.reshape(shape)
+        return hidden_states
+
+
+class DacVectorQuantize(nn.Module):
+    """
+    Implementation of VQ similar to Karpathy's repo (https://github.com/karpathy/deep-vector-quantization)
+
+    Additionally uses following tricks from improved VQGAN
+    (https://huggingface.co/papers/2110.04627):
+        1. Factorized codes: Perform nearest neighbor lookup in low-dimensional space
+            for improved codebook usage
+        2. l2-normalized codes: Converts euclidean distance to cosine similarity which
+            improves training stability
+    """
+
+    def __init__(self, config: DacConfig):
+        super().__init__()
+
+        self.codebook_dim = config.codebook_dim
+        self.in_proj = nn.Conv1d(config.hidden_size, config.codebook_dim, kernel_size=1)
+        self.out_proj = nn.Conv1d(config.codebook_dim, config.hidden_size, kernel_size=1)
+        self.codebook = nn.Embedding(config.codebook_size, config.codebook_dim)
+
+    def forward(self, hidden_state):
+        """
+        Quantizes the input tensor using a fixed codebook and returns the corresponding codebook vectors.
+
+        Args:
+            hidden_state (`torch.FloatTensor` of shape `(batch_size, dimension, time_steps)`):
+                Input tensor.
+
+        Returns:
+            quantized_representation (`torch.Tensor`of shape `(batch_size, dimension, time_steps)`):
+                Quantized continuous representation of input.
+            commitment_loss (`torch.FloatTensor`of shape `(1)`):
+                Commitment loss to train encoder to predict vectors closer to codebook entries.
+            codebook_loss (`torch.FloatTensor`of shape `(1)`):
+                Codebook loss to update the codebook.
+            audio_codes (`torch.LongTensor` of shape `(batch_size, time_steps)`):
+                Codebook indices for each codebook, quantized discrete representation of input.
+            projected_latents (torch.FloatTensor of shape `(batch_size, num_codebooks * dimension, time_steps)`):
+                Projected latents (continuous representation of input before quantization).
+        """
+
+        projected_latents = self.in_proj(hidden_state)
+        quantized_representation, audio_codes = self.decode_latents(projected_latents)
+
+        commitment_loss = F.mse_loss(projected_latents, quantized_representation.detach(), reduction="mean")
+        codebook_loss = F.mse_loss(quantized_representation, projected_latents.detach(), reduction="mean")
+        # noop in forward pass, straight-through gradient estimator in backward pass
+        quantized_representation = projected_latents + (quantized_representation - projected_latents).detach()
+        quantized_representation = self.out_proj(quantized_representation)
+
+        return quantized_representation, commitment_loss, codebook_loss, audio_codes, projected_latents
+
+    def decode_latents(self, hidden_states):
+        batch_size, hidden_dim, sequence_length = hidden_states.shape
+        encodings = hidden_states.permute(0, 2, 1).reshape(batch_size * sequence_length, hidden_dim)
+        codebook = self.codebook.weight  # codebook: (N x D)
+
+        # L2 normalize encodings and codebook (ViT-VQGAN)
+        encodings = F.normalize(encodings)
+        codebook = F.normalize(codebook)
+
+        # Compute euclidean distance with codebook
+        l2_norm = encodings.pow(2).sum(1, keepdim=True)
+        dist = -(l2_norm - 2 * encodings @ codebook.t()) + codebook.pow(2).sum(1, keepdim=True).t()
+
+        indices = dist.max(1)[1]
+        indices = indices.reshape(hidden_states.size(0), -1)
+        quantized_representation = self.codebook(indices).transpose(1, 2)
+        return quantized_representation, indices
+
+
+class DacResidualUnit(nn.Module):
+    """
+    A residual unit composed of Snake1d and weight-normalized Conv1d layers with dilations.
+    """
+
+    def __init__(self, dimension: int = 16, dilation: int = 1):
+        super().__init__()
+        pad = ((7 - 1) * dilation) // 2
+
+        self.snake1 = Snake1d(dimension)
+        self.conv1 = nn.Conv1d(dimension, dimension, kernel_size=7, dilation=dilation, padding=pad)
+        self.snake2 = Snake1d(dimension)
+        self.conv2 = nn.Conv1d(dimension, dimension, kernel_size=1)
+
+    def forward(self, hidden_state):
+        """
+        Forward pass through the residual unit.
+
+        Args:
+            hidden_state (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
+                Input tensor .
+
+        Returns:
+            output_tensor (`torch.Tensor` of shape `(batch_size, channels, time_steps)`):
+                Input tensor after passing through the residual unit.
+        """
+        output_tensor = hidden_state
+        output_tensor = self.conv1(self.snake1(output_tensor))
+        output_tensor = self.conv2(self.snake2(output_tensor))
+
+        padding = (hidden_state.shape[-1] - output_tensor.shape[-1]) // 2
+        if padding > 0:
+            hidden_state = hidden_state[..., padding:-padding]
+        output_tensor = hidden_state + output_tensor
+        return output_tensor
+
+
+class DacEncoderBlock(nn.Module):
+    """Encoder block used in DAC encoder."""
+
+    def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
+        super().__init__()
+
+        dimension = config.encoder_hidden_size * 2**stride_index
+        self.res_unit1 = DacResidualUnit(dimension // 2, dilation=1)
+        self.res_unit2 = DacResidualUnit(dimension // 2, dilation=3)
+        self.res_unit3 = DacResidualUnit(dimension // 2, dilation=9)
+        self.snake1 = Snake1d(dimension // 2)
+        self.conv1 = nn.Conv1d(
+            dimension // 2, dimension, kernel_size=2 * stride, stride=stride, padding=math.ceil(stride / 2)
+        )
+
+    def forward(self, hidden_state):
+        hidden_state = self.res_unit1(hidden_state)
+        hidden_state = self.res_unit2(hidden_state)
+        hidden_state = self.snake1(self.res_unit3(hidden_state))
+        hidden_state = self.conv1(hidden_state)
+
+        return hidden_state
+
+
+class DacDecoderBlock(nn.Module):
+    """Decoder block used in DAC decoder."""
+
+    def __init__(self, config: DacConfig, stride: int = 1, stride_index: int = 1):
+        super().__init__()
+
+        input_dim = config.decoder_hidden_size // 2**stride_index
+        output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1)
+        self.snake1 = Snake1d(input_dim)
+        self.conv_t1 = nn.ConvTranspose1d(
+            input_dim,
+            output_dim,
+            kernel_size=2 * stride,
+            stride=stride,
+            padding=math.ceil(stride / 2),
+        )
+
+        self.res_unit1 = DacResidualUnit(output_dim, dilation=1)
+        self.res_unit2 = DacResidualUnit(output_dim, dilation=3)
+        self.res_unit3 = DacResidualUnit(output_dim, dilation=9)
+
+    def forward(self, hidden_state):
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv_t1(hidden_state)
+        hidden_state = self.res_unit1(hidden_state)
+        hidden_state = self.res_unit2(hidden_state)
+        hidden_state = self.res_unit3(hidden_state)
+
+        return hidden_state
+
+
+class DacResidualVectorQuantize(nn.Module):
+    """
+    ResidualVectorQuantize block - Introduced in SoundStream: An end2end neural audio codec (https://huggingface.co/papers/2107.03312)
+    """
+
+    def __init__(self, config: DacConfig):
+        super().__init__()
+
+        n_codebooks = config.n_codebooks
+        quantizer_dropout = config.quantizer_dropout
+
+        self.n_codebooks = n_codebooks
+
+        self.quantizers = nn.ModuleList([DacVectorQuantize(config) for i in range(config.n_codebooks)])
+        self.quantizer_dropout = quantizer_dropout
+
+    def forward(self, hidden_state, n_quantizers: Optional[int] = None):
+        """
+        Quantizes the input tensor using a fixed set of codebooks and returns corresponding codebook vectors.
+        Args:
+            hidden_state (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+                Input tensor to be quantized.
+            n_quantizers (`int`, *optional*):
+                Number of quantizers to use. If specified and `self.quantizer_dropout` is True,
+                this argument is ignored during training, and a random number of quantizers is used.
+
+        Returns:
+            quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+                Quantized continuous representation of input.
+            audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
+                Codebook indices for each codebook (quantized discrete representation of input).
+            projected_latents (`torch.Tensor` of shape `(batch_size, num_codebooks * dimension, time_steps)`):
+                Projected latents (continuous representation of input before quantization).
+            commitment_loss (`torch.Tensor` of shape `(1)`):
+                Commitment loss to train the encoder to predict vectors closer to codebook entries.
+            codebook_loss (`torch.Tensor` of shape `(1)`):
+                Codebook loss to update the codebook.
+        """
+
+        quantized_representation = 0
+        residual = hidden_state
+        commitment_loss = 0
+        codebook_loss = 0
+
+        audio_codes = []
+        projected_latents = []
+
+        n_quantizers = n_quantizers if n_quantizers is not None else self.n_codebooks
+        if self.training:
+            n_quantizers = torch.ones((hidden_state.shape[0],)) * self.n_codebooks + 1
+            dropout = torch.randint(1, self.n_codebooks + 1, (hidden_state.shape[0],))
+            n_dropout = int(hidden_state.shape[0] * self.quantizer_dropout)
+            n_quantizers[:n_dropout] = dropout[:n_dropout]
+            n_quantizers = n_quantizers.to(hidden_state.device)
+
+        for i, quantizer in enumerate(self.quantizers):
+            if self.training is False and i >= n_quantizers:
+                break
+
+            quantized_representation_i, commitment_loss_i, codebook_loss_i, indices_i, projected_latents_i = quantizer(
+                residual
+            )
+
+            # Create mask to apply quantizer dropout
+            mask = torch.full((hidden_state.shape[0],), fill_value=i, device=hidden_state.device) < n_quantizers
+            quantized_representation = quantized_representation + quantized_representation_i * mask[:, None, None]
+            residual = residual - quantized_representation_i
+
+            # Sum losses
+            commitment_loss += commitment_loss_i * mask
+            codebook_loss += codebook_loss_i * mask
+
+            audio_codes.append(indices_i)
+            projected_latents.append(projected_latents_i)
+
+        audio_codes = torch.stack(audio_codes, dim=1)
+        projected_latents = torch.cat(projected_latents, dim=1)
+
+        return quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss
+
+    def from_codes(self, audio_codes: torch.Tensor):
+        """
+        Reconstructs the continuous representation from quantized codes.
+
+        Args:
+            audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`):
+                Quantized discrete representation of input.
+
+        Returns:
+            quantized_representation (`torch.Tensor`):
+                Quantized continuous representation of input.
+            projected_latents (`torch.Tensor`):
+                List of projected latents (continuous representations of input before quantization)
+                for each codebook.
+            audio_codes (`torch.Tensor`):
+                Codebook indices for each codebook.
+        """
+        quantized_representation = 0.0
+        projected_latents = []
+        n_codebooks = audio_codes.shape[1]
+        for i in range(n_codebooks):
+            projected_latents_i = self.quantizers[i].codebook(audio_codes[:, i, :]).transpose(1, 2)
+            projected_latents.append(projected_latents_i)
+            quantized_representation += self.quantizers[i].out_proj(projected_latents_i)
+        return quantized_representation, torch.cat(projected_latents, dim=1), audio_codes
+
+    def from_latents(self, latents: torch.Tensor):
+        """Reconstructs the quantized representation from unquantized latents.
+
+        Args:
+            latents (`torch.Tensor` of shape `(batch_size, total_latent_dimension, time_steps)`):
+                Continuous representation of input after projection.
+
+        Returns:
+            quantized_representation (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+                Quantized representation of the full-projected space.
+            quantized_latents (`torch.Tensor` of shape `(batch_size, dimension, time_steps)`):
+                Quantized representation of the latent space (continuous representation before quantization).
+        """
+        quantized_representation = 0
+        quantized_latents = []
+        codes = []
+        codebook_dims_tensor = torch.tensor([0] + [q.codebook_dim for q in self.quantizers])
+        dims = torch.cumsum(codebook_dims_tensor, dim=0)
+
+        n_codebooks = np.where(dims <= latents.shape[1])[0].max(axis=0, keepdims=True)[0]
+        for i in range(n_codebooks):
+            hidden_dim_j, hidden_dim_k = dims[i], dims[i + 1]
+            quantized_latents_i, codes_i = self.quantizers[i].decode_latents(latents[:, hidden_dim_j:hidden_dim_k, :])
+            quantized_latents.append(quantized_latents_i)
+            codes.append(codes_i)
+
+            quantized_representation_i = self.quantizers[i].out_proj(quantized_latents_i)
+            quantized_representation = quantized_representation + quantized_representation_i
+
+        return quantized_representation, torch.cat(quantized_latents, dim=1)
+
+
+class DacDecoder(nn.Module):
+    """DAC Decoder"""
+
+    def __init__(self, config: DacConfig):
+        super().__init__()
+
+        input_channel = config.hidden_size
+        channels = config.decoder_hidden_size
+        strides = config.upsampling_ratios
+
+        # Add first conv layer
+        self.conv1 = nn.Conv1d(input_channel, channels, kernel_size=7, padding=3)
+
+        # Add upsampling + MRF blocks
+        block = []
+        for stride_index, stride in enumerate(strides):
+            block += [DacDecoderBlock(config, stride, stride_index)]
+
+        self.block = nn.ModuleList(block)
+        output_dim = config.decoder_hidden_size // 2 ** (stride_index + 1)
+        self.snake1 = Snake1d(output_dim)
+        self.conv2 = nn.Conv1d(output_dim, 1, kernel_size=7, padding=3)
+        self.tanh = nn.Tanh()
+
+    def forward(self, hidden_state):
+        hidden_state = self.conv1(hidden_state)
+
+        for layer in self.block:
+            hidden_state = layer(hidden_state)
+
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv2(hidden_state)
+        hidden_state = self.tanh(hidden_state)
+
+        return hidden_state
+
+
+class DacEncoder(nn.Module):
+    """DAC Encoder"""
+
+    def __init__(self, config: DacConfig):
+        super().__init__()
+
+        strides = config.downsampling_ratios
+        # Create first convolution
+        self.conv1 = nn.Conv1d(1, config.encoder_hidden_size, kernel_size=7, padding=3)
+
+        self.block = []
+        # Create EncoderBlocks that double channels as they downsample by `stride`
+        for stride_index, stride in enumerate(strides):
+            stride_index = stride_index + 1
+            self.block += [DacEncoderBlock(config, stride=stride, stride_index=stride_index)]
+
+        self.block = nn.ModuleList(self.block)
+        d_model = config.encoder_hidden_size * 2**stride_index
+        self.snake1 = Snake1d(d_model)
+        self.conv2 = nn.Conv1d(d_model, config.hidden_size, kernel_size=3, padding=1)
+
+    def forward(self, hidden_state):
+        hidden_state = self.conv1(hidden_state)
+
+        for module in self.block:
+            hidden_state = module(hidden_state)
+
+        hidden_state = self.snake1(hidden_state)
+        hidden_state = self.conv2(hidden_state)
+
+        return hidden_state
+
+
+@auto_docstring
+class DacPreTrainedModel(PreTrainedAudioTokenizerBase):
+    config: DacConfig
+    base_model_prefix = "dac"
+    main_input_name = "input_values"
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Conv1d):
+            nn.init.trunc_normal_(module.weight, std=0.02)
+            nn.init.constant_(module.bias, 0)
+        elif isinstance(module, Snake1d):
+            module.alpha.data.fill_(1.0)
+        elif isinstance(module, nn.ConvTranspose1d):
+            module.reset_parameters()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        for layer in self.quantizer.quantizers:
+            weight_norm(layer.in_proj)
+            weight_norm(layer.out_proj)
+
+        weight_norm(self.encoder.conv1)
+        weight_norm(self.encoder.conv2)
+
+        for layer in self.encoder.block:
+            weight_norm(layer.conv1)
+            weight_norm(layer.res_unit1.conv1)
+            weight_norm(layer.res_unit1.conv2)
+            weight_norm(layer.res_unit2.conv1)
+            weight_norm(layer.res_unit2.conv2)
+            weight_norm(layer.res_unit3.conv1)
+            weight_norm(layer.res_unit3.conv2)
+
+        weight_norm(self.decoder.conv1)
+        weight_norm(self.decoder.conv2)
+
+        for layer in self.decoder.block:
+            weight_norm(layer.conv_t1)
+            weight_norm(layer.res_unit1.conv1)
+            weight_norm(layer.res_unit1.conv2)
+            weight_norm(layer.res_unit2.conv1)
+            weight_norm(layer.res_unit2.conv2)
+            weight_norm(layer.res_unit3.conv1)
+            weight_norm(layer.res_unit3.conv2)
+
+    def remove_weight_norm(self):
+        for layer in self.quantizer.quantizers:
+            nn.utils.remove_weight_norm(layer.in_proj)
+            nn.utils.remove_weight_norm(layer.out_proj)
+
+        nn.utils.remove_weight_norm(self.encoder.conv1)
+        nn.utils.remove_weight_norm(self.encoder.conv2)
+
+        for layer in self.encoder.block:
+            nn.utils.remove_weight_norm(layer.conv1)
+            nn.utils.remove_weight_norm(layer.res_unit1.conv1)
+            nn.utils.remove_weight_norm(layer.res_unit1.conv2)
+            nn.utils.remove_weight_norm(layer.res_unit2.conv1)
+            nn.utils.remove_weight_norm(layer.res_unit2.conv2)
+            nn.utils.remove_weight_norm(layer.res_unit3.conv1)
+            nn.utils.remove_weight_norm(layer.res_unit3.conv2)
+
+        nn.utils.remove_weight_norm(self.decoder.conv1)
+        nn.utils.remove_weight_norm(self.decoder.conv2)
+
+        for layer in self.decoder.block:
+            nn.utils.remove_weight_norm(layer.conv_t1)
+            nn.utils.remove_weight_norm(layer.res_unit1.conv1)
+            nn.utils.remove_weight_norm(layer.res_unit1.conv2)
+            nn.utils.remove_weight_norm(layer.res_unit2.conv1)
+            nn.utils.remove_weight_norm(layer.res_unit2.conv2)
+            nn.utils.remove_weight_norm(layer.res_unit3.conv1)
+            nn.utils.remove_weight_norm(layer.res_unit3.conv2)
+
+
+@auto_docstring(
+    custom_intro="""
+    The DAC (Descript Audio Codec) model.
+    """
+)
+class DacModel(DacPreTrainedModel):
+    def __init__(self, config: DacConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.encoder = DacEncoder(config)
+        self.decoder = DacDecoder(config)
+
+        self.quantizer = DacResidualVectorQuantize(config)
+
+        self.bits_per_codebook = int(math.log2(self.config.codebook_size))
+        if 2**self.bits_per_codebook != self.config.codebook_size:
+            raise ValueError("The codebook_size must be a power of 2.")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def encode(
+        self,
+        input_values: torch.Tensor,
+        n_quantizers: Optional[int] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        input_values (`torch.Tensor of shape `(batch_size, 1, time_steps)`):
+            Input audio data to encode,
+        n_quantizers (int, *optional*):
+            Number of quantizers to use. If None, all quantizers are used. Default is None.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        quantized_representation = self.encoder(input_values)
+        quantized_representation, audio_codes, projected_latents, commitment_loss, codebook_loss = self.quantizer(
+            quantized_representation, n_quantizers
+        )
+
+        loss = self.config.commitment_loss_weight * commitment_loss + self.config.codebook_loss_weight * codebook_loss
+
+        if not return_dict:
+            return (loss, quantized_representation, audio_codes, projected_latents)
+
+        return DacEncoderOutput(loss, quantized_representation, audio_codes, projected_latents)
+
+    @auto_docstring
+    def decode(
+        self,
+        quantized_representation: Optional[torch.Tensor] = None,
+        audio_codes: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        quantized_representation (torch.Tensor of shape `(batch_size, dimension, time_steps)`, *optional*):
+            Quantized continuous representation of input.
+        audio_codes (`torch.Tensor` of shape `(batch_size, num_codebooks, time_steps)`, *optional*):
+            The codebook indices for each codebook, representing the quantized discrete
+            representation of the input. This parameter should be provided if you want
+            to decode directly from the audio codes (it will overwrite quantized_representation).
+        return_dict (`bool`, *optional*, defaults to `True`):
+            Whether to return a [`DacDecoderOutput`] instead of a plain tuple.
+        """
+
+        if quantized_representation is None and audio_codes is None:
+            raise ValueError("Either `quantized_representation` or `audio_codes` must be provided.")
+
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if audio_codes is not None:
+            quantized_representation = self.quantizer.from_codes(audio_codes)[0]
+
+        audio_values = self.decoder(quantized_representation).squeeze(1)
+
+        if not return_dict:
+            return (audio_values,)
+
+        return DacDecoderOutput(audio_values)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: torch.Tensor,
+        n_quantizers: Optional[int] = None,
+        return_dict: Optional[bool] = None,
+    ):
+        r"""
+        input_values (`torch.Tensor` of shape `(batch_size, 1, time_steps)`):
+            Audio data to encode.
+        n_quantizers (`int`, *optional*):
+            Number of quantizers to use. If `None`, all quantizers are used. Default is `None`.
+
+        Examples:
+
+        ```python
+        >>> from datasets import load_dataset, Audio
+        >>> from transformers import DacModel, AutoProcessor
+        >>> librispeech_dummy = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+
+        >>> model = DacModel.from_pretrained("descript/dac_16khz")
+        >>> processor = AutoProcessor.from_pretrained("descript/dac_16khz")
+        >>> librispeech_dummy = librispeech_dummy.cast_column("audio", Audio(sampling_rate=processor.sampling_rate))
+        >>> audio_sample = librispeech_dummy[-1]["audio"]["array"]
+        >>> inputs = processor(raw_audio=audio_sample, sampling_rate=processor.sampling_rate, return_tensors="pt")
+
+        >>> encoder_outputs = model.encode(inputs["input_values"])
+        >>> # Get the intermediate audio codes
+        >>> audio_codes = encoder_outputs.audio_codes
+        >>> # Reconstruct the audio from its quantized representation
+        >>> audio_values = model.decode(encoder_outputs.quantized_representation)
+        >>> # or the equivalent with a forward pass
+        >>> audio_values = model(inputs["input_values"]).audio_values
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+        length = input_values.shape[-1]
+
+        loss, quantized_representation, audio_codes, projected_latents = self.encode(
+            input_values, n_quantizers, return_dict=False
+        )
+        audio_values = self.decode(quantized_representation, return_dict=False)[0][..., :length]
+
+        if not return_dict:
+            return (loss, audio_values, quantized_representation, audio_codes, projected_latents)
+
+        return DacOutput(loss, audio_values, quantized_representation, audio_codes, projected_latents)
+
+
+__all__ = ["DacModel", "DacPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19116604f739c69a26cc582f599768e8f111b14e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_audio.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6c6023c8671a89b448c4c5597d837e34bb3e71f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_audio.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb4536f0fa15ec414e51343ba2a86dbbace70974
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_vision.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ba9dcb5920302549787ed75f3b9c68b9be8a0e3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/configuration_data2vec_vision.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_audio.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97c3914b373f4dbc78c61ffb1f4bf458dfbd2f9c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_audio.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7e393d306b2a881a4b9d615b4ffa7be3e73e1a0a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_vision.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71b52305aee18ed515fdbb1f631f7b3a1bea2368
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_data2vec_vision.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_tf_data2vec_vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_tf_data2vec_vision.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f804e768e2b834fe1f10cc1c4dfe42160f1ac495
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modeling_tf_data2vec_vision.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modular_data2vec_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modular_data2vec_audio.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09052665af1120ac0685fce96052a20b62ecf793
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/data2vec/__pycache__/modular_data2vec_audio.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..72964d548b78905f10d2b729a7c29c67822ad7d6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/configuration_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/configuration_deberta_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99670af8103c2012aa728e436b41083dbb4124b2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/configuration_deberta_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_deberta_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54f555fad75eb9315ced68afe2308576812fe122
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_deberta_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_tf_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_tf_deberta_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30b3bee5865a5ef47584dc588b5bdbb07e8eb398
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/modeling_tf_deberta_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3897946546bf6d334b1fc99f72e3bc84120d6acb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b4b345e1468811044478395000885546da4c73e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deberta_v2/__pycache__/tokenization_deberta_v2_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..436834c7e5e038b79d9cf6ef21e34ba5eaf6ae64
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/configuration_decision_transformer.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Decision Transformer model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DecisionTransformerConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`DecisionTransformerModel`]. It is used to
+    instantiate a Decision Transformer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the standard
+    DecisionTransformer architecture. Many of the config options are used to instantiate the GPT2 model that is used as
+    part of the architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        state_dim (`int`, *optional*, defaults to 17):
+            The state size for the RL environment
+        act_dim (`int`, *optional*, defaults to 4):
+            The size of the output action space
+        hidden_size (`int`, *optional*, defaults to 128):
+            The size of the hidden layers
+        max_ep_len (`int`, *optional*, defaults to 4096):
+            The maximum length of an episode in the environment
+        action_tanh (`bool`, *optional*, defaults to True):
+            Whether to use a tanh activation on action prediction
+        vocab_size (`int`, *optional*, defaults to 50257):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DecisionTransformerModel`].
+        n_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_layer (`int`, *optional*, defaults to 3):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 1):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*):
+            Dimensionality of the inner feed-forward layers. If unset, will default to 4 times `n_embd`.
+        activation_function (`str`, *optional*, defaults to `"gelu"`):
+            Activation function, to be selected in the list `["relu", "silu", "gelu", "tanh", "gelu_new"]`.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+
+    Example:
+
+    ```python
+    >>> from transformers import DecisionTransformerConfig, DecisionTransformerModel
+
+    >>> # Initializing a DecisionTransformer configuration
+    >>> configuration = DecisionTransformerConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = DecisionTransformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "decision_transformer"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        state_dim=17,
+        act_dim=4,
+        hidden_size=128,
+        max_ep_len=4096,
+        action_tanh=True,
+        vocab_size=1,
+        n_positions=1024,
+        n_layer=3,
+        n_head=1,
+        n_inner=None,
+        activation_function="relu",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        **kwargs,
+    ):
+        self.state_dim = state_dim
+        self.act_dim = act_dim
+        self.hidden_size = hidden_size
+        self.max_ep_len = max_ep_len
+        self.action_tanh = action_tanh
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+
+        super().__init__(bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+
+__all__ = ["DecisionTransformerConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..f9c68fcbdeaeca9fb89c160de8f440333befd5fd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/decision_transformer/modeling_decision_transformer.py
@@ -0,0 +1,934 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Team The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DecisionTransformer model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPastAndCrossAttentions
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_decision_transformer import DecisionTransformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.load_tf_weights_in_gpt2
+def load_tf_weights_in_gpt2(model, config, gpt2_checkpoint_path):
+    """Load tf checkpoints in a pytorch model"""
+    try:
+        import re
+
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(gpt2_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array.squeeze())
+
+    for name, array in zip(names, arrays):
+        name = name[6:]  # skip "model/"
+        name = name.split("/")
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+                scope_names = re.split(r"(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "w" or scope_names[0] == "g":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "b":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
+                pointer = getattr(pointer, scope_names[0])
+                pointer = getattr(pointer, "weight")
+            else:
+                pointer = getattr(pointer, scope_names[0])
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except ValueError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.eager_attention_forward
+def eager_attention_forward(module, query, key, value, attention_mask, head_mask=None, **kwargs):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2))
+
+    if module.scale_attn_weights:
+        attn_weights = attn_weights / torch.full(
+            [], value.size(-1) ** 0.5, dtype=attn_weights.dtype, device=attn_weights.device
+        )
+
+    # Layer-wise attention scaling
+    if module.scale_attn_by_inverse_layer_idx:
+        attn_weights = attn_weights / float(module.layer_idx + 1)
+
+    if not module.is_cross_attention:
+        # if only "normal" attention layer implements causal mask
+        query_length, key_length = query.size(-2), key.size(-2)
+        causal_mask = module.bias[:, :, key_length - query_length : key_length, :key_length]
+        mask_value = torch.finfo(attn_weights.dtype).min
+        # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+        # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+        mask_value = torch.full([], mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+        attn_weights = torch.where(causal_mask, attn_weights.to(attn_weights.dtype), mask_value)
+
+    if attention_mask is not None:
+        # Apply the attention mask
+        causal_mask = attention_mask[:, :, :, : key.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+    # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
+    attn_weights = attn_weights.type(value.dtype)
+    attn_weights = module.attn_dropout(attn_weights)
+
+    # Mask heads if we want to
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2)
+
+    return attn_output, attn_weights
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Attention with GPT2->DecisionTransformerGPT2
+class DecisionTransformerGPT2Attention(nn.Module):
+    def __init__(self, config, is_cross_attention=False, layer_idx=None):
+        super().__init__()
+        self.config = config
+        max_positions = config.max_position_embeddings
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                1, 1, max_positions, max_positions
+            ),
+            persistent=False,
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.split_size = self.embed_dim
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.scale_attn_weights = config.scale_attn_weights
+        self.is_cross_attention = is_cross_attention
+
+        # Layer-wise attention scaling, reordering, and upcasting
+        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+        self.layer_idx = layer_idx
+        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
+
+        if self.is_cross_attention:
+            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
+            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
+        else:
+            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
+        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
+
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+        self.is_causal = True
+
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
+
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+
+        # Update hyper params
+        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
+        self.num_heads = self.num_heads - len(heads)
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
+        bsz, num_heads, q_seq_len, dk = query.size()
+        _, _, k_seq_len, _ = key.size()
+
+        # Preallocate attn_weights for `baddbmm`
+        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
+
+        # Compute Scale Factor
+        scale_factor = 1.0
+        if self.scale_attn_weights:
+            scale_factor /= float(value.size(-1)) ** 0.5
+
+        if self.scale_attn_by_inverse_layer_idx:
+            scale_factor /= float(self.layer_idx + 1)
+
+        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
+        with torch.autocast(query.device.type, enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+
+        if not self.is_cross_attention:
+            # if only "normal" attention layer implements causal mask
+            query_length, key_length = query.size(-2), key.size(-2)
+            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+            mask_value = torch.finfo(attn_weights.dtype).min
+            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
+        if attn_weights.dtype != torch.float32:
+            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+        attn_output = attn_output.transpose(1, 2)
+
+        return attn_output, attn_weights
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+        is_cross_attention = encoder_hidden_states is not None
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_layer from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        if is_cross_attention:
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `DecisionTransformerGPT2Attention(..., is_cross_attention=True)`."
+                )
+            query_states = self.q_attn(hidden_states)
+            attention_mask = encoder_attention_mask
+
+            # Try to get key/value states from cache if possible
+            if past_key_values is not None and is_updated:
+                key_states = curr_past_key_value.layers[self.layer_idx].keys
+                value_states = curr_past_key_value.layers[self.layer_idx].values
+            else:
+                key_states, value_states = self.c_attn(encoder_hidden_states).split(self.split_size, dim=2)
+                shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
+                key_states = key_states.view(shape_kv).transpose(1, 2)
+                value_states = value_states.view(shape_kv).transpose(1, 2)
+        else:
+            query_states, key_states, value_states = self.c_attn(hidden_states).split(self.split_size, dim=2)
+            shape_kv = (*key_states.shape[:-1], -1, self.head_dim)
+            key_states = key_states.view(shape_kv).transpose(1, 2)
+            value_states = value_states.view(shape_kv).transpose(1, 2)
+
+        shape_q = (*query_states.shape[:-1], -1, self.head_dim)
+        query_states = query_states.view(shape_q).transpose(1, 2)
+
+        if (past_key_values is not None and not is_cross_attention) or (
+            past_key_values is not None and is_cross_attention and not is_updated
+        ):
+            # save all key/value_layer to cache to be re-used for fast auto-regressive generation
+            cache_position = cache_position if not is_cross_attention else None
+            key_states, value_states = curr_past_key_value.update(
+                key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+            )
+            # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+            if is_cross_attention:
+                past_key_values.is_updated[self.layer_idx] = True
+
+        is_causal = attention_mask is None and query_states.shape[-2] > 1 and not is_cross_attention
+
+        using_eager = self.config._attn_implementation == "eager"
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        if using_eager and self.reorder_and_upcast_attn:
+            attn_output, attn_weights = self._upcast_and_reordered_attn(
+                query_states, key_states, value_states, attention_mask, head_mask
+            )
+        else:
+            attn_output, attn_weights = attention_interface(
+                self,
+                query_states,
+                key_states,
+                value_states,
+                attention_mask,
+                head_mask=head_mask,
+                dropout=self.attn_dropout.p if self.training else 0.0,
+                is_causal=is_causal,
+                **kwargs,
+            )
+
+        attn_output = attn_output.reshape(*attn_output.shape[:-2], -1).contiguous()
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.GPT2MLP with GPT2->DecisionTransformerGPT2
+class DecisionTransformerGPT2MLP(nn.Module):
+    def __init__(self, intermediate_size, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = Conv1D(intermediate_size, embed_dim)
+        self.c_proj = Conv1D(embed_dim, intermediate_size)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.gpt2.modeling_gpt2.GPT2Block with GPT2->DecisionTransformerGPT2
+class DecisionTransformerGPT2Block(GradientCheckpointingLayer):
+    # Ignore copy
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = DecisionTransformerGPT2Attention(config, layer_idx=layer_idx)
+        self.ln_2 = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        if config.add_cross_attention:
+            self.crossattention = DecisionTransformerGPT2Attention(
+                config, is_cross_attention=True, layer_idx=layer_idx
+            )
+            self.ln_cross_attn = nn.LayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = DecisionTransformerGPT2MLP(inner_dim, config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], Optional[tuple[torch.Tensor, tuple[torch.FloatTensor, ...]]]]:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_output, self_attn_weights = self.attn(
+            hidden_states,
+            past_key_values=past_key_values,
+            cache_position=cache_position,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            **kwargs,
+        )
+        # residual connection
+        hidden_states = attn_output + residual
+
+        if encoder_hidden_states is not None:
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            residual = hidden_states
+            hidden_states = self.ln_cross_attn(hidden_states)
+            cross_attn_output, cross_attn_weights = self.crossattention(
+                hidden_states,
+                past_key_values=past_key_values,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+            # residual connection
+            hidden_states = residual + cross_attn_output
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+
+        outputs = (hidden_states,)
+        if output_attentions:
+            outputs += (self_attn_weights,)
+            if encoder_hidden_states is not None:
+                outputs += (cross_attn_weights,)
+
+        return outputs
+
+
+@auto_docstring
+class DecisionTransformerGPT2PreTrainedModel(PreTrainedModel):
+    config: DecisionTransformerConfig
+    load_tf_weights = load_tf_weights_in_gpt2
+    base_model_prefix = "transformer"
+    is_parallelizable = True
+    supports_gradient_checkpointing = True
+
+    _can_compile_fullgraph = False
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+        #
+        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+        for name, p in module.named_parameters():
+            if "c_proj" in name and "weight" in name:
+                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+
+class DecisionTransformerGPT2Model(DecisionTransformerGPT2PreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList(
+            [DecisionTransformerGPT2Block(config, layer_idx=i) for i in range(config.num_hidden_layers)]
+        )
+        self.ln_f = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+        # based on pattern from src/transformers/models/whisper/modeling_whisper.py::WhisperDecoder and similar addition in GPT2Model
+        if use_cache:
+            if past_key_values is None:
+                past_key_values = DynamicCache(config=self.config)
+
+            if self.config.add_cross_attention and not isinstance(past_key_values, EncoderDecoderCache):
+                past_key_values = EncoderDecoderCache(past_key_values, DynamicCache(config=self.config))
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        # Attention mask.
+        if attention_mask is not None:
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
+            attention_mask = attention_mask.view(batch_size, -1)
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask[:, None, None, :]
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and the dtype's smallest value for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.add_cross_attention and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds
+
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+
+        output_shape = (-1,) + input_shape[1:] + (hidden_states.size(-1),)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                past_key_values if not (self.gradient_checkpointing and self.training) else None,
+                cache_position,
+                attention_mask,
+                head_mask[i],
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (outputs[2],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.ln_f(hidden_states)
+
+        hidden_states = hidden_states.view(output_shape)
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        past_key_values = past_key_values if use_cache else None
+        # no return to legacy cache
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+class DecisionTransformerOutput(ModelOutput):
+    r"""
+    state_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, state_dim)`):
+        Environment state predictions
+    action_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, action_dim)`):
+        Model action predictions
+    return_preds (`torch.FloatTensor` of shape `(batch_size, sequence_length, 1)`):
+        Predicted returns for each state
+    """
+
+    state_preds: Optional[torch.FloatTensor] = None
+    action_preds: Optional[torch.FloatTensor] = None
+    return_preds: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[torch.FloatTensor] = None
+    attentions: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+
+
+class DecisionTransformerPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: DecisionTransformerConfig
+    base_model_prefix = "decision_transformer"
+    main_input_name = "states"
+    supports_gradient_checkpointing = False
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring(
+    custom_intro="""
+    The Decision Transformer Model
+    """
+)
+class DecisionTransformerModel(DecisionTransformerPreTrainedModel):
+    """
+
+    The model builds upon the GPT2 architecture to perform autoregressive prediction of actions in an offline RL
+    setting. Refer to the paper for more details: https://huggingface.co/papers/2106.01345
+
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.hidden_size = config.hidden_size
+        # note: the only difference between this GPT2Model and the default Huggingface version
+        # is that the positional embeddings are removed (since we'll add those ourselves)
+        self.encoder = DecisionTransformerGPT2Model(config)
+
+        self.embed_timestep = nn.Embedding(config.max_ep_len, config.hidden_size)
+        self.embed_return = torch.nn.Linear(1, config.hidden_size)
+        self.embed_state = torch.nn.Linear(config.state_dim, config.hidden_size)
+        self.embed_action = torch.nn.Linear(config.act_dim, config.hidden_size)
+
+        self.embed_ln = nn.LayerNorm(config.hidden_size)
+
+        # note: we don't predict states or returns for the paper
+        self.predict_state = torch.nn.Linear(config.hidden_size, config.state_dim)
+        self.predict_action = nn.Sequential(
+            *([nn.Linear(config.hidden_size, config.act_dim)] + ([nn.Tanh()] if config.action_tanh else []))
+        )
+        self.predict_return = torch.nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        states: Optional[torch.FloatTensor] = None,
+        actions: Optional[torch.FloatTensor] = None,
+        rewards: Optional[torch.FloatTensor] = None,
+        returns_to_go: Optional[torch.FloatTensor] = None,
+        timesteps: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DecisionTransformerOutput]:
+        r"""
+        states (`torch.FloatTensor` of shape `(batch_size, episode_length, state_dim)`):
+            The states for each step in the trajectory
+        actions (`torch.FloatTensor` of shape `(batch_size, episode_length, act_dim)`):
+            The actions taken by the "expert" policy for the current state, these are masked for auto regressive
+            prediction
+        rewards (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
+            The rewards for each state, action
+        returns_to_go (`torch.FloatTensor` of shape `(batch_size, episode_length, 1)`):
+            The returns for each state in the trajectory
+        timesteps (`torch.LongTensor` of shape `(batch_size, episode_length)`):
+            The timestep for each step in the trajectory
+
+        Examples:
+
+        ```python
+        >>> from transformers import DecisionTransformerModel
+        >>> import torch
+
+        >>> model = DecisionTransformerModel.from_pretrained("edbeeching/decision-transformer-gym-hopper-medium")
+        >>> # evaluation
+        >>> model = model.to(device)
+        >>> model.eval()
+
+        >>> env = gym.make("Hopper-v3")
+        >>> state_dim = env.observation_space.shape[0]
+        >>> act_dim = env.action_space.shape[0]
+
+        >>> state = env.reset()
+        >>> states = torch.from_numpy(state).reshape(1, 1, state_dim).to(device=device, dtype=torch.float32)
+        >>> actions = torch.zeros((1, 1, act_dim), device=device, dtype=torch.float32)
+        >>> rewards = torch.zeros(1, 1, device=device, dtype=torch.float32)
+        >>> target_return = torch.tensor(TARGET_RETURN, dtype=torch.float32).reshape(1, 1)
+        >>> timesteps = torch.tensor(0, device=device, dtype=torch.long).reshape(1, 1)
+        >>> attention_mask = torch.zeros(1, 1, device=device, dtype=torch.float32)
+
+        >>> # forward pass
+        >>> with torch.no_grad():
+        ...     state_preds, action_preds, return_preds = model(
+        ...         states=states,
+        ...         actions=actions,
+        ...         rewards=rewards,
+        ...         returns_to_go=target_return,
+        ...         timesteps=timesteps,
+        ...         attention_mask=attention_mask,
+        ...         return_dict=False,
+        ...     )
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, seq_length = states.shape[0], states.shape[1]
+
+        if attention_mask is None:
+            # attention mask for GPT: 1 if can be attended to, 0 if not
+            attention_mask = torch.ones((batch_size, seq_length), dtype=torch.long)
+
+        # embed each modality with a different head
+        state_embeddings = self.embed_state(states)
+        action_embeddings = self.embed_action(actions)
+        returns_embeddings = self.embed_return(returns_to_go)
+        time_embeddings = self.embed_timestep(timesteps)
+
+        # time embeddings are treated similar to positional embeddings
+        state_embeddings = state_embeddings + time_embeddings
+        action_embeddings = action_embeddings + time_embeddings
+        returns_embeddings = returns_embeddings + time_embeddings
+
+        # this makes the sequence look like (R_1, s_1, a_1, R_2, s_2, a_2, ...)
+        # which works nice in an autoregressive sense since states predict actions
+        stacked_inputs = (
+            torch.stack((returns_embeddings, state_embeddings, action_embeddings), dim=1)
+            .permute(0, 2, 1, 3)
+            .reshape(batch_size, 3 * seq_length, self.hidden_size)
+        )
+        stacked_inputs = self.embed_ln(stacked_inputs)
+
+        # to make the attention mask fit the stacked inputs, have to stack it as well
+        stacked_attention_mask = (
+            torch.stack((attention_mask, attention_mask, attention_mask), dim=1)
+            .permute(0, 2, 1)
+            .reshape(batch_size, 3 * seq_length)
+        )
+        device = stacked_inputs.device
+        # we feed in the input embeddings (not word indices as in NLP) to the model
+        encoder_outputs = self.encoder(
+            inputs_embeds=stacked_inputs,
+            attention_mask=stacked_attention_mask,
+            position_ids=torch.zeros(stacked_attention_mask.shape, device=device, dtype=torch.long),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        x = encoder_outputs[0]
+
+        # reshape x so that the second dimension corresponds to the original
+        # returns (0), states (1), or actions (2); i.e. x[:,1,t] is the token for s_t
+        x = x.reshape(batch_size, seq_length, 3, self.hidden_size).permute(0, 2, 1, 3)
+
+        # get predictions
+        return_preds = self.predict_return(x[:, 2])  # predict next return given state and action
+        state_preds = self.predict_state(x[:, 2])  # predict next state given state and action
+        action_preds = self.predict_action(x[:, 1])  # predict next action given state
+        if not return_dict:
+            return (state_preds, action_preds, return_preds)
+
+        return DecisionTransformerOutput(
+            last_hidden_state=encoder_outputs.last_hidden_state,
+            state_preds=state_preds,
+            action_preds=action_preds,
+            return_preds=return_preds,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+__all__ = [
+    "DecisionTransformerGPT2Model",
+    "DecisionTransformerGPT2PreTrainedModel",
+    "DecisionTransformerModel",
+    "DecisionTransformerPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3abae5af0a7f7613432e71ef13a1c3b9774176c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/configuration_deepseek_vl.py
@@ -0,0 +1,97 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_deepseek_vl.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a
+    DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DeepseekVL
+    [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `SiglipVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 100015):
+            The index representing image tokens in the model's token vocabulary.
+
+    Example:
+
+    ```python
+    >>> from transformers import DeepseekVLConfig, DeepseekVLModel
+
+    >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> configuration = DeepseekVLConfig()
+
+    >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> model = DeepseekVLModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deepseek_vl"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        text_config: Optional[AutoConfig] = None,
+        vision_config: Optional[AutoConfig] = None,
+        image_token_id: int = 100015,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.image_token_id = image_token_id
+
+
+__all__ = ["DeepseekVLConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..12aa7caf892e52262c5d612551a19e8f9bd4f58d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/image_processing_deepseek_vl.py
@@ -0,0 +1,425 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_deepseek_vl.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import convert_to_rgb, resize, to_channel_dimension_format
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a DEEPSEEK_VL image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 384, "width": 384}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        min_size (`int`, *optional*, defaults to 14):
+            The minimum allowed size for the resized image. Ensures that neither the height nor width
+            falls below this value after resizing.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`. Can be
+            overridden by the `resample` parameter in the `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Only has an effect if `do_rescale` is set to `True`. Can be
+            overridden by the `rescale_factor` parameter in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method. Can be overridden by the `do_normalize` parameter in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method. Can be
+            overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Whether to pad the image to square or not.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        min_size: int = 14,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        do_pad: Optional[bool] = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 384, "width": 384}
+        size = get_size_dict(size, default_to_square=True)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+
+        self.do_pad = do_pad
+        self.min_size = min_size
+        if image_mean is None:
+            self.background_color = (127, 127, 127)
+        else:
+            self.background_color = tuple(int(x * 255) for x in image_mean)
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: Union[dict[str, int], int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to dynamically calculated size.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]` or `int`):
+                The size to resize the image to. If a dictionary, it should have the keys `"height"` and `"width"`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BICUBIC`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `None`: will be inferred from input
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        height, width = get_image_size(image, input_data_format)
+        max_size = max(height, width)
+
+        size = get_size_dict(size, default_to_square=True)
+        if size["height"] != size["width"]:
+            raise ValueError(
+                f"Output height and width must be the same. Got height={size['height']} and width={size['width']}"
+            )
+        size = size["height"]
+
+        delta = size / max_size
+        # Largest side becomes `size` and the other side is scaled according to the aspect ratio.
+        output_size_nonpadded = [
+            max(int(height * delta), self.min_size),
+            max(int(width * delta), self.min_size),
+        ]
+
+        image = resize(
+            image,
+            size=output_size_nonpadded,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        background_color: Optional[Union[int, tuple[int, int, int]]] = None,
+        do_pad: Optional[bool] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Controls the size of the image after `resize`. The shortest edge of the image is resized to
+                `size["shortest_edge"]` whilst preserving the aspect ratio. If the longest edge of this resized image
+                is > `int(size["shortest_edge"] * (1333 / 800))`, then the image is resized again to make the longest
+                edge equal to `int(size["shortest_edge"] * (1333 / 800))`.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to normalize the image by if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to normalize the image by if `do_normalize` is set to `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            background_color (`tuple[int, int, int]`):
+                The background color to use for the padding.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the image to square or not.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        background_color = background_color if background_color is not None else self.background_color
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_pad:
+            # Expand and pad the images to obtain a square image of dimensions `size x size`
+            images = [
+                self.pad_to_square(
+                    image=image,
+                    background_color=background_color,
+                    input_data_format=input_data_format,
+                )
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        encoded_outputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+        return encoded_outputs
+
+    def pad_to_square(
+        self,
+        image: np.ndarray,
+        background_color: Union[int, tuple[int, int, int]] = 0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pads an image to a square based on the longest edge.
+
+        Args:
+            image (`np.ndarray`):
+                The image to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers representing for multi-channel images. If passed as integer
+                in multi-channel mode, it will default to `0` in subsequent channels.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+
+        Returns:
+            `np.ndarray`: The padded image.
+        """
+        height, width = get_image_size(image, input_data_format)
+        num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]
+
+        if height == width:
+            image = (
+                to_channel_dimension_format(image, data_format, input_data_format)
+                if data_format is not None
+                else image
+            )
+            return image
+
+        max_dim = max(height, width)
+
+        # Ensure background_color is the correct shape
+        if isinstance(background_color, int):
+            background_color = [background_color]
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have no more than {num_channels} elements to match the number of channels"
+            )
+
+        if input_data_format == ChannelDimension.FIRST:
+            result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
+            for i, color in enumerate(background_color):
+                result[i, :, :] = color
+            if width > height:
+                start = (max_dim - height) // 2
+                result[:, start : start + height, :] = image
+            else:
+                start = (max_dim - width) // 2
+                result[:, :, start : start + width] = image
+        else:
+            result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
+            for i, color in enumerate(background_color):
+                result[:, :, i] = color
+            if width > height:
+                start = (max_dim - height) // 2
+                result[start : start + height, :, :] = image
+            else:
+                start = (max_dim - width) // 2
+                result[:, start : start + width, :] = image
+
+        return result
+
+
+__all__ = ["DeepseekVLImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce884da8d08bb8680152bd2a58e537f876919f0f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modeling_deepseek_vl.py
@@ -0,0 +1,350 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/deepseek_vl/modular_deepseek_vl.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_deepseek_vl.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ..auto import AutoModel
+from .configuration_deepseek_vl import DeepseekVLConfig
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DeepseekVL model's outputs that may also contain a past key/values (to speed up sequential decoding).
+    """
+)
+class DeepseekVLBaseModelOutputWithPast(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+
+        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+        hidden_size)` is output.
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks and optionally if
+        `config.is_encoder_decoder=True` in the cross-attention blocks) that can be used (see `past_key_values`
+        input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+        sequence_length, hidden_size)`.
+
+        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DeepseekVL causal language model (or autoregressive) outputs.
+    """
+)
+class DeepseekVLCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`tuple(torch.FloatTensor)`, *optional*):
+        Tuple of `torch.FloatTensor` (one for the output of the image embeddings, `(batch_size, num_images,
+        sequence_length, hidden_size)`.
+
+        image_hidden_states of the model produced by the vision encoder, and optionally by the perceiver
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+class DeepseekVLAligner(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        in_features = config.vision_config.hidden_size
+        out_features = config.text_config.hidden_size
+
+        self.linear1 = nn.Linear(in_features, out_features)
+        self.activation = nn.GELU()
+        self.linear2 = nn.Linear(out_features, out_features)
+
+    def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor:
+        x = self.linear1(vision_encodings)
+        x = self.activation(x)
+        x = self.linear2(x)
+        return x
+
+
+@auto_docstring
+class DeepseekVLPreTrainedModel(PreTrainedModel):
+    config: DeepseekVLConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values", "causal_mask"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_param_buffer_assignment = False
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        # Required only for Linear layer in DeepseekVLAligner
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+@auto_docstring
+class DeepseekVLModel(DeepseekVLPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.vision_model = AutoModel.from_config(config.vision_config)
+        self.aligner = DeepseekVLAligner(config)
+
+        self.language_model = AutoModel.from_config(config=config.text_config)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def get_image_features(self, pixel_values):
+        image_embeds = self.vision_model(pixel_values)
+        image_embeds = self.aligner(image_embeds.last_hidden_state)
+        return image_embeds
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            n_image_features = image_features.shape[0] * image_features.shape[1]
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ):
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_embeds = self.get_image_features(pixel_values)
+            image_features = image_embeds.reshape(-1, inputs_embeds.shape[-1])
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            image_attention_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(image_attention_mask, image_features)
+
+        lm_output = self.language_model(
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        return DeepseekVLBaseModelOutputWithPast(
+            last_hidden_state=lm_output.last_hidden_state,
+            past_key_values=lm_output.past_key_values,
+            hidden_states=lm_output.hidden_states,
+            attentions=lm_output.attentions,
+            image_hidden_states=image_embeds if pixel_values is not None else None,
+        )
+
+
+class DeepseekVLForConditionalGeneration(DeepseekVLPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["model.language_model.embed_tokens.weight", "lm_head.weight"]
+    _can_compile_fullgraph = True
+
+    def __init__(self, config: DeepseekVLConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = DeepseekVLModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.language_model.set_input_embeddings(value)
+
+    def prepare_embeddings_for_image_generation(self) -> torch.Tensor:
+        raise AttributeError("Not needed for DeepseekVL")
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ):
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+        """
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return DeepseekVLCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        pixel_values=None,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- extra custom processing
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+
+__all__ = ["DeepseekVLPreTrainedModel", "DeepseekVLModel", "DeepseekVLForConditionalGeneration"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py
new file mode 100644
index 0000000000000000000000000000000000000000..5bfc0ae7d74c3b48f9a009edcc7daba60ccc3382
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deepseek_vl/modular_deepseek_vl.py
@@ -0,0 +1,333 @@
+# Copyright 2025 Deepseek AI and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ...configuration_utils import PretrainedConfig
+from ...image_processing_utils import BatchFeature
+from ...image_utils import ImageInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import (
+    PreTokenizedInput,
+    TextInput,
+)
+from ...utils import (
+    auto_docstring,
+    logging,
+)
+from ..auto import CONFIG_MAPPING, AutoConfig, AutoModel
+from ..idefics.modeling_idefics import IdeficsBaseModelOutputWithPast, IdeficsCausalLMOutputWithPast
+from ..janus.image_processing_janus import JanusImageProcessor
+from ..janus.image_processing_janus_fast import JanusImageProcessorFast
+from ..janus.modeling_janus import JanusForConditionalGeneration, JanusModel, JanusPreTrainedModel
+
+
+logger = logging.get_logger(__name__)
+
+
+class DeepseekVLConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DeepseekVLModel`]. It is used to instantiate a
+    DeepseekVL model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DeepseekVL
+    [deepseek-community/deepseek-vl-1.3b-chat](https://huggingface.co/deepseek-community/deepseek-vl-1.3b-chat) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `SiglipVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        image_token_id (`int`, *optional*, defaults to 100015):
+            The index representing image tokens in the model's token vocabulary.
+
+    Example:
+
+    ```python
+    >>> from transformers import DeepseekVLConfig, DeepseekVLModel
+
+    >>> # Initializing a DeepseekVL deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> configuration = DeepseekVLConfig()
+
+    >>> # Initializing a model (with random weights) from the deepseek-community/deepseek-vl-1.3b-chat style configuration
+    >>> model = DeepseekVLModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deepseek_vl"
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        text_config: Optional[AutoConfig] = None,
+        vision_config: Optional[AutoConfig] = None,
+        image_token_id: int = 100015,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if text_config is None:
+            text_config = {}
+            logger.info("`text_config` is `None`. Initializing the `LlamaConfig` with default values.")
+
+        if vision_config is None:
+            vision_config = {}
+            logger.info("`vision_config` is `None`. Initializing the `SiglipVisionConfig` with default values.")
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+
+        self.text_config = text_config
+        self.vision_config = vision_config
+        self.image_token_id = image_token_id
+
+
+class DeepseekVLBaseModelOutputWithPast(IdeficsBaseModelOutputWithPast):
+    pass
+
+
+class DeepseekVLCausalLMOutputWithPast(IdeficsCausalLMOutputWithPast):
+    pass
+
+
+class DeepseekVLAligner(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        in_features = config.vision_config.hidden_size
+        out_features = config.text_config.hidden_size
+
+        self.linear1 = nn.Linear(in_features, out_features)
+        self.activation = nn.GELU()
+        self.linear2 = nn.Linear(out_features, out_features)
+
+    def forward(self, vision_encodings: torch.Tensor) -> torch.Tensor:
+        x = self.linear1(vision_encodings)
+        x = self.activation(x)
+        x = self.linear2(x)
+        return x
+
+
+class DeepseekVLPreTrainedModel(JanusPreTrainedModel):
+    _no_split_modules = ["LlamaDecoderLayer"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        # Required only for Linear layer in DeepseekVLAligner
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.text_config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+@auto_docstring
+class DeepseekVLModel(JanusModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.vision_model = AutoModel.from_config(config.vision_config)
+        self.aligner = DeepseekVLAligner(config)
+
+        self.language_model = AutoModel.from_config(config=config.text_config)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing.
+        self.post_init()
+
+        del self.vqmodel
+        del self.generation_embeddings
+        del self.generation_aligner
+        del self.generation_head
+
+
+class DeepseekVLForConditionalGeneration(JanusForConditionalGeneration):
+    def prepare_embeddings_for_image_generation(self):
+        raise AttributeError("Not needed for DeepseekVL")
+
+    def decode_image_tokens(self):
+        raise AttributeError("Not needed for DeepseekVL")
+
+    def generate(self):
+        raise AttributeError("Not needed for DeepseekVL")
+
+
+class DeepseekVLImageProcessor(JanusImageProcessor):
+    def __init__(self, **super_kwargs):
+        super().__init__(**super_kwargs)
+
+    def postprocess(self):
+        raise AttributeError("Not needed for DeepseekVL")
+
+    def unnormalize(self):
+        raise AttributeError("Not needed for DeepseekVL")
+
+
+class DeepseekVLImageProcessorFast(JanusImageProcessorFast):
+    def __init__(self, **super_kwargs):
+        super().__init__(**super_kwargs)
+
+    def postprocess(self):
+        raise AttributeError("Not needed for DeepseekVL")
+
+
+class DeepseekVLProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {"padding": False},
+        "common_kwargs": {"return_tensors": "pt"},
+    }
+
+
+class DeepseekVLProcessor(ProcessorMixin):
+    r"""
+    Constructs a DeepseekVL processor which wraps a DeepseekVL Image Processor and a Llama tokenizer into a single processor.
+
+    [`DeepseekVLProcessor`] offers all the functionalities of [`DeepseekVLImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~DeepseekVLProcessor.__call__`] and [`~DeepseekVLProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`DeepseekVLImageProcessor`]):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`]):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*):
+            A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        num_image_tokens (`int`, *optional*, defaults to 576):
+            The number of special image tokens used as placeholders for visual content in text sequences.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    valid_kwargs = ["chat_template", "num_image_tokens"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor,
+        tokenizer,
+        chat_template=None,
+        num_image_tokens=576,
+    ):
+        self.image_token = tokenizer.image_token
+        self.num_image_tokens = num_image_tokens
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        images: Optional[ImageInput] = None,
+        **kwargs: Unpack[DeepseekVLProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        DeepseekVLImageProcessor's [`~DeepseekVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
+        of the above two methods for more information.
+
+        Args:
+            text (`str`, `List[str]`, `List[List[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        output_kwargs = self._merge_kwargs(
+            DeepseekVLProcessorKwargs, tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs
+        )
+        if text is None and images is None:
+            raise ValueError("You must specify either text or images.")
+
+        if text is not None:
+            if isinstance(text, str):
+                text = [text]
+            elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        prompt_strings = []
+        one_img_tokens = self.image_token * self.num_image_tokens
+        for prompt in text:
+            prompt = prompt.replace(self.image_token, one_img_tokens)
+            prompt_strings.append(prompt)
+
+        data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
+
+        # process images if pixel_values are provided
+        if images is not None:
+            data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+
+        return BatchFeature(data=data)
+
+    def batch_decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        return self.tokenizer.batch_decode(*args, **kwargs)
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to LlamaTokenizerFast's [`~PreTrainedTokenizer.decode`]. Please refer to
+        the docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @property
+    def model_input_names(self):
+        tokenizer_input_names = self.tokenizer.model_input_names
+        image_processor_input_names = self.image_processor.model_input_names
+        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))
+
+
+__all__ = [
+    "DeepseekVLConfig",
+    "DeepseekVLPreTrainedModel",
+    "DeepseekVLModel",
+    "DeepseekVLForConditionalGeneration",
+    "DeepseekVLImageProcessor",
+    "DeepseekVLImageProcessorFast",
+    "DeepseekVLProcessor",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94189b16d9784bf35d24d52c602514f6b387afc3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a4707d31dc609dd91977f3a5ede00a0f0c92d10
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/configuration_deformable_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d26e0670a59457103cd7c92101a95b182e712b61
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/feature_extraction_deformable_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..55ac45c6bdc722224bb6f7b612411cd2fc1e28cb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b76c6a46be7da65cb30209dd386edc0e03fc5ccc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/image_processing_deformable_detr_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b9ec83ab52900788222d9b6913b6b2c3930c7640
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modeling_deformable_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17fad64a6922343e6f6472dd637b8d9559a1e9f0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deformable_detr/__pycache__/modular_deformable_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8ef20e7e9a7174f90a132cda36ed9591f211c10
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/bort/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6e06e186741991b2f9183f30b6591477bc60dfb5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_deta import *
+    from .image_processing_deta import *
+    from .modeling_deta import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/configuration_deta.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/configuration_deta.py
new file mode 100644
index 0000000000000000000000000000000000000000..2109902ac06546ce4e235403dd3b95bf27486272
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/configuration_deta.py
@@ -0,0 +1,278 @@
+# coding=utf-8
+# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DETA model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ...auto import CONFIG_MAPPING
+
+
+logger = logging.get_logger(__name__)
+
+
+class DetaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DetaModel`]. It is used to instantiate a DETA
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the DETA
+    [SenseTime/deformable-detr](https://huggingface.co/SenseTime/deformable-detr) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        backbone_config (`PretrainedConfig` or `dict`, *optional*, defaults to `ResNetConfig()`):
+            The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        num_queries (`int`, *optional*, defaults to 900):
+            Number of object queries, i.e. detection slots. This is the maximal number of objects [`DetaModel`] can
+            detect in a single image. In case `two_stage` is set to `True`, we use `two_stage_num_proposals` instead.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimension of the layers.
+        encoder_layers (`int`, *optional*, defaults to 6):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        init_xavier_std (`float`, *optional*, defaults to 1):
+            The scaling factor used for the Xavier initialization gain in the HM Attention map module.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        auxiliary_loss (`bool`, *optional*, defaults to `False`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        position_embedding_type (`str`, *optional*, defaults to `"sine"`):
+            Type of position embeddings to be used on top of the image features. One of `"sine"` or `"learned"`.
+        class_cost (`float`, *optional*, defaults to 1):
+            Relative weight of the classification error in the Hungarian matching cost.
+        bbox_cost (`float`, *optional*, defaults to 5):
+            Relative weight of the L1 error of the bounding box coordinates in the Hungarian matching cost.
+        giou_cost (`float`, *optional*, defaults to 2):
+            Relative weight of the generalized IoU loss of the bounding box in the Hungarian matching cost.
+        mask_loss_coefficient (`float`, *optional*, defaults to 1):
+            Relative weight of the Focal loss in the panoptic segmentation loss.
+        dice_loss_coefficient (`float`, *optional*, defaults to 1):
+            Relative weight of the DICE/F-1 loss in the panoptic segmentation loss.
+        bbox_loss_coefficient (`float`, *optional*, defaults to 5):
+            Relative weight of the L1 bounding box loss in the object detection loss.
+        giou_loss_coefficient (`float`, *optional*, defaults to 2):
+            Relative weight of the generalized IoU loss in the object detection loss.
+        eos_coefficient (`float`, *optional*, defaults to 0.1):
+            Relative classification weight of the 'no-object' class in the object detection loss.
+        num_feature_levels (`int`, *optional*, defaults to 5):
+            The number of input feature levels.
+        encoder_n_points (`int`, *optional*, defaults to 4):
+            The number of sampled keys in each feature level for each attention head in the encoder.
+        decoder_n_points (`int`, *optional*, defaults to 4):
+            The number of sampled keys in each feature level for each attention head in the decoder.
+        two_stage (`bool`, *optional*, defaults to `True`):
+            Whether to apply a two-stage deformable DETR, where the region proposals are also generated by a variant of
+            DETA, which are further fed into the decoder for iterative bounding box refinement.
+        two_stage_num_proposals (`int`, *optional*, defaults to 300):
+            The number of region proposals to be generated, in case `two_stage` is set to `True`.
+        with_box_refine (`bool`, *optional*, defaults to `True`):
+            Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
+            based on the predictions from the previous layer.
+        focal_alpha (`float`, *optional*, defaults to 0.25):
+            Alpha parameter in the focal loss.
+        assign_first_stage (`bool`, *optional*, defaults to `True`):
+            Whether to assign each prediction i to the highest overlapping ground truth object if the overlap is larger than a threshold 0.7.
+        assign_second_stage (`bool`, *optional*, defaults to `True`):
+            Whether to assign second assignment procedure in the second stage closely follows the first stage assignment procedure.
+        disable_custom_kernels (`bool`, *optional*, defaults to `True`):
+            Disable the use of custom CUDA and CPU kernels. This option is necessary for the ONNX export, as custom
+            kernels are not supported by PyTorch ONNX export.
+
+    Examples:
+
+    ```python
+    >>> from transformers import DetaConfig, DetaModel
+
+    >>> # Initializing a DETA SenseTime/deformable-detr style configuration
+    >>> configuration = DetaConfig()
+
+    >>> # Initializing a model (with random weights) from the SenseTime/deformable-detr style configuration
+    >>> model = DetaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "deta"
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+    }
+
+    def __init__(
+        self,
+        backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
+        use_timm_backbone=False,
+        backbone_kwargs=None,
+        num_queries=900,
+        max_position_embeddings=2048,
+        encoder_layers=6,
+        encoder_ffn_dim=2048,
+        encoder_attention_heads=8,
+        decoder_layers=6,
+        decoder_ffn_dim=1024,
+        decoder_attention_heads=8,
+        encoder_layerdrop=0.0,
+        is_encoder_decoder=True,
+        activation_function="relu",
+        d_model=256,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        init_xavier_std=1.0,
+        return_intermediate=True,
+        auxiliary_loss=False,
+        position_embedding_type="sine",
+        num_feature_levels=5,
+        encoder_n_points=4,
+        decoder_n_points=4,
+        two_stage=True,
+        two_stage_num_proposals=300,
+        with_box_refine=True,
+        assign_first_stage=True,
+        assign_second_stage=True,
+        class_cost=1,
+        bbox_cost=5,
+        giou_cost=2,
+        mask_loss_coefficient=1,
+        dice_loss_coefficient=1,
+        bbox_loss_coefficient=5,
+        giou_loss_coefficient=2,
+        eos_coefficient=0.1,
+        focal_alpha=0.25,
+        disable_custom_kernels=True,
+        **kwargs,
+    ):
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
+            logger.info("`backbone_config` is `None`. Initializing the config with the default `ResNet` backbone.")
+            backbone_config = CONFIG_MAPPING["resnet"](out_features=["stage2", "stage3", "stage4"])
+        else:
+            if isinstance(backbone_config, dict):
+                backbone_model_type = backbone_config.pop("model_type")
+                config_class = CONFIG_MAPPING[backbone_model_type]
+                backbone_config = config_class.from_dict(backbone_config)
+
+        if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
+            raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
+
+        self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.backbone_kwargs = backbone_kwargs
+        self.num_queries = num_queries
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.init_xavier_std = init_xavier_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.auxiliary_loss = auxiliary_loss
+        self.position_embedding_type = position_embedding_type
+        # deformable attributes
+        self.num_feature_levels = num_feature_levels
+        self.encoder_n_points = encoder_n_points
+        self.decoder_n_points = decoder_n_points
+        self.two_stage = two_stage
+        self.two_stage_num_proposals = two_stage_num_proposals
+        self.with_box_refine = with_box_refine
+        self.assign_first_stage = assign_first_stage
+        self.assign_second_stage = assign_second_stage
+        if two_stage is True and with_box_refine is False:
+            raise ValueError("If two_stage is True, with_box_refine must be True.")
+        # Hungarian matcher
+        self.class_cost = class_cost
+        self.bbox_cost = bbox_cost
+        self.giou_cost = giou_cost
+        # Loss coefficients
+        self.mask_loss_coefficient = mask_loss_coefficient
+        self.dice_loss_coefficient = dice_loss_coefficient
+        self.bbox_loss_coefficient = bbox_loss_coefficient
+        self.giou_loss_coefficient = giou_loss_coefficient
+        self.eos_coefficient = eos_coefficient
+        self.focal_alpha = focal_alpha
+        self.disable_custom_kernels = disable_custom_kernels
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
+
+    @property
+    def sub_configs(self):
+        return (
+            {"backbone_config": type(self.backbone_config)}
+            if getattr(self, "backbone_config", None) is not None
+            else {}
+        )
+
+
+__all__ = ["DetaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/image_processing_deta.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/image_processing_deta.py
new file mode 100644
index 0000000000000000000000000000000000000000..434d25a1ab5152b0de5e74463c3fe15eb16f901b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/image_processing_deta.py
@@ -0,0 +1,1228 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for Deformable DETR."""
+
+import pathlib
+from collections.abc import Iterable
+from typing import Any, Callable, Optional, Union
+
+import numpy as np
+
+from ....feature_extraction_utils import BatchFeature
+from ....image_processing_utils import BaseImageProcessor, get_size_dict
+from ....image_transforms import (
+    PaddingMode,
+    center_to_corners_format,
+    corners_to_center_format,
+    pad,
+    rescale,
+    resize,
+    rgb_to_id,
+    to_channel_dimension_format,
+)
+from ....image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_batched,
+    is_scaled_image,
+    to_numpy_array,
+    valid_images,
+    validate_annotations,
+    validate_preprocess_arguments,
+)
+from ....utils import (
+    is_flax_available,
+    is_jax_tensor,
+    is_tf_available,
+    is_tf_tensor,
+    is_torch_available,
+    is_torch_tensor,
+    is_torchvision_available,
+    is_vision_available,
+    logging,
+)
+from ....utils.generic import TensorType
+
+
+if is_torch_available():
+    import torch
+
+
+if is_torchvision_available():
+    from torchvision.ops.boxes import batched_nms
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+def get_size_with_aspect_ratio(image_size, size, max_size=None) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size.
+
+    Args:
+        image_size (`tuple[int, int]`):
+            The input image size.
+        size (`int`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+    """
+    height, width = image_size
+    raw_size = None
+    if max_size is not None:
+        min_original_size = float(min((height, width)))
+        max_original_size = float(max((height, width)))
+        if max_original_size / min_original_size * size > max_size:
+            raw_size = max_size * min_original_size / max_original_size
+            size = int(round(raw_size))
+
+    if (height <= width and height == size) or (width <= height and width == size):
+        oh, ow = height, width
+    elif width < height:
+        ow = size
+        if max_size is not None and raw_size is not None:
+            oh = int(raw_size * height / width)
+        else:
+            oh = int(size * height / width)
+    else:
+        oh = size
+        if max_size is not None and raw_size is not None:
+            ow = int(raw_size * width / height)
+        else:
+            ow = int(size * width / height)
+
+    return (oh, ow)
+
+
+def get_resize_output_image_size(
+    input_image: np.ndarray,
+    size: Union[int, tuple[int, int], list[int]],
+    max_size: Optional[int] = None,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image size and the desired output size. If the desired output size
+    is a tuple or list, the output image size is returned as is. If the desired output size is an integer, the output
+    image size is computed by keeping the aspect ratio of the input image size.
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        size (`int` or `tuple[int, int]` or `list[int]`):
+            The desired output size.
+        max_size (`int`, *optional*):
+            The maximum allowed output size.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    if isinstance(size, (list, tuple)):
+        return size
+
+    return get_size_with_aspect_ratio(image_size, size, max_size)
+
+
+def get_image_size_for_max_height_width(
+    input_image: np.ndarray,
+    max_height: int,
+    max_width: int,
+    input_data_format: Optional[Union[str, ChannelDimension]] = None,
+) -> tuple[int, int]:
+    """
+    Computes the output image size given the input image and the maximum allowed height and width. Keep aspect ratio.
+    Important, even if image_height < max_height and image_width < max_width, the image will be resized
+    to at least one of the edges be equal to max_height or max_width.
+
+    For example:
+        - input_size: (100, 200), max_height: 50, max_width: 50 -> output_size: (25, 50)
+        - input_size: (100, 200), max_height: 200, max_width: 500 -> output_size: (200, 400)
+
+    Args:
+        input_image (`np.ndarray`):
+            The image to resize.
+        max_height (`int`):
+            The maximum allowed height.
+        max_width (`int`):
+            The maximum allowed width.
+        input_data_format (`ChannelDimension` or `str`, *optional*):
+            The channel dimension format of the input image. If not provided, it will be inferred from the input image.
+    """
+    image_size = get_image_size(input_image, input_data_format)
+    height, width = image_size
+    height_scale = max_height / height
+    width_scale = max_width / width
+    min_scale = min(height_scale, width_scale)
+    new_height = int(height * min_scale)
+    new_width = int(width * min_scale)
+    return new_height, new_width
+
+
+def get_numpy_to_framework_fn(arr) -> Callable:
+    """
+    Returns a function that converts a numpy array to the framework of the input array.
+
+    Args:
+        arr (`np.ndarray`): The array to convert.
+    """
+    if isinstance(arr, np.ndarray):
+        return np.array
+    if is_tf_available() and is_tf_tensor(arr):
+        import tensorflow as tf
+
+        return tf.convert_to_tensor
+    if is_torch_available() and is_torch_tensor(arr):
+        import torch
+
+        return torch.tensor
+    if is_flax_available() and is_jax_tensor(arr):
+        import jax.numpy as jnp
+
+        return jnp.array
+    raise ValueError(f"Cannot convert arrays of type {type(arr)}")
+
+
+def safe_squeeze(arr: np.ndarray, axis: Optional[int] = None) -> np.ndarray:
+    """
+    Squeezes an array, but only if the axis specified has dim 1.
+    """
+    if axis is None:
+        return arr.squeeze()
+
+    try:
+        return arr.squeeze(axis=axis)
+    except ValueError:
+        return arr
+
+
+def normalize_annotation(annotation: dict, image_size: tuple[int, int]) -> dict:
+    image_height, image_width = image_size
+    norm_annotation = {}
+    for key, value in annotation.items():
+        if key == "boxes":
+            boxes = value
+            boxes = corners_to_center_format(boxes)
+            boxes /= np.asarray([image_width, image_height, image_width, image_height], dtype=np.float32)
+            norm_annotation[key] = boxes
+        else:
+            norm_annotation[key] = value
+    return norm_annotation
+
+
+def max_across_indices(values: Iterable[Any]) -> list[Any]:
+    """
+    Return the maximum value across all indices of an iterable of values.
+    """
+    return [max(values_i) for values_i in zip(*values)]
+
+
+def get_max_height_width(
+    images: list[np.ndarray], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> list[int]:
+    """
+    Get the maximum height and width across all images in a batch.
+    """
+    if input_data_format is None:
+        input_data_format = infer_channel_dimension_format(images[0])
+
+    if input_data_format == ChannelDimension.FIRST:
+        _, max_height, max_width = max_across_indices([img.shape for img in images])
+    elif input_data_format == ChannelDimension.LAST:
+        max_height, max_width, _ = max_across_indices([img.shape for img in images])
+    else:
+        raise ValueError(f"Invalid channel dimension format: {input_data_format}")
+    return (max_height, max_width)
+
+
+def make_pixel_mask(
+    image: np.ndarray, output_size: tuple[int, int], input_data_format: Optional[Union[str, ChannelDimension]] = None
+) -> np.ndarray:
+    """
+    Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+
+    Args:
+        image (`np.ndarray`):
+            Image to make the pixel mask for.
+        output_size (`tuple[int, int]`):
+            Output size of the mask.
+    """
+    input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+    mask = np.zeros(output_size, dtype=np.int64)
+    mask[:input_height, :input_width] = 1
+    return mask
+
+
+def convert_coco_poly_to_mask(segmentations, height: int, width: int) -> np.ndarray:
+    """
+    Convert a COCO polygon annotation to a mask.
+
+    Args:
+        segmentations (`list[list[float]]`):
+            List of polygons, each polygon represented by a list of x-y coordinates.
+        height (`int`):
+            Height of the mask.
+        width (`int`):
+            Width of the mask.
+    """
+    try:
+        from pycocotools import mask as coco_mask
+    except ImportError:
+        raise ImportError("Pycocotools is not installed in your environment.")
+
+    masks = []
+    for polygons in segmentations:
+        rles = coco_mask.frPyObjects(polygons, height, width)
+        mask = coco_mask.decode(rles)
+        if len(mask.shape) < 3:
+            mask = mask[..., None]
+        mask = np.asarray(mask, dtype=np.uint8)
+        mask = np.any(mask, axis=2)
+        masks.append(mask)
+    if masks:
+        masks = np.stack(masks, axis=0)
+    else:
+        masks = np.zeros((0, height, width), dtype=np.uint8)
+
+    return masks
+
+
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by DETA.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+
+    image_id = target["image_id"]
+    image_id = np.asarray([image_id], dtype=np.int64)
+
+    # Get all COCO annotations for the given image.
+    annotations = target["annotations"]
+    annotations = [obj for obj in annotations if "iscrowd" not in obj or obj["iscrowd"] == 0]
+
+    classes = [obj["category_id"] for obj in annotations]
+    classes = np.asarray(classes, dtype=np.int64)
+
+    # for conversion to coco api
+    area = np.asarray([obj["area"] for obj in annotations], dtype=np.float32)
+    iscrowd = np.asarray([obj.get("iscrowd", 0) for obj in annotations], dtype=np.int64)
+
+    boxes = [obj["bbox"] for obj in annotations]
+    # guard against no boxes via resizing
+    boxes = np.asarray(boxes, dtype=np.float32).reshape(-1, 4)
+    boxes[:, 2:] += boxes[:, :2]
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {}
+    new_target["image_id"] = image_id
+    new_target["class_labels"] = classes[keep]
+    new_target["boxes"] = boxes[keep]
+    new_target["area"] = area[keep]
+    new_target["iscrowd"] = iscrowd[keep]
+    new_target["orig_size"] = np.asarray([int(image_height), int(image_width)], dtype=np.int64)
+
+    if annotations and "keypoints" in annotations[0]:
+        keypoints = [obj["keypoints"] for obj in annotations]
+        # Converting the filtered keypoints list to a numpy array
+        keypoints = np.asarray(keypoints, dtype=np.float32)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    if return_segmentation_masks:
+        segmentation_masks = [obj["segmentation"] for obj in annotations]
+        masks = convert_coco_poly_to_mask(segmentation_masks, image_height, image_width)
+        new_target["masks"] = masks[keep]
+
+    return new_target
+
+
+def masks_to_boxes(masks: np.ndarray) -> np.ndarray:
+    """
+    Compute the bounding boxes around the provided panoptic segmentation masks.
+
+    Args:
+        masks: masks in format `[number_masks, height, width]` where N is the number of masks
+
+    Returns:
+        boxes: bounding boxes in format `[number_masks, 4]` in xyxy format
+    """
+    if masks.size == 0:
+        return np.zeros((0, 4))
+
+    h, w = masks.shape[-2:]
+    y = np.arange(0, h, dtype=np.float32)
+    x = np.arange(0, w, dtype=np.float32)
+    # see https://github.com/pytorch/pytorch/issues/50276
+    y, x = np.meshgrid(y, x, indexing="ij")
+
+    x_mask = masks * np.expand_dims(x, axis=0)
+    x_max = x_mask.reshape(x_mask.shape[0], -1).max(-1)
+    x = np.ma.array(x_mask, mask=~(np.array(masks, dtype=bool)))
+    x_min = x.filled(fill_value=1e8)
+    x_min = x_min.reshape(x_min.shape[0], -1).min(-1)
+
+    y_mask = masks * np.expand_dims(y, axis=0)
+    y_max = y_mask.reshape(x_mask.shape[0], -1).max(-1)
+    y = np.ma.array(y_mask, mask=~(np.array(masks, dtype=bool)))
+    y_min = y.filled(fill_value=1e8)
+    y_min = y_min.reshape(y_min.shape[0], -1).min(-1)
+
+    return np.stack([x_min, y_min, x_max, y_max], 1)
+
+
+def prepare_coco_panoptic_annotation(
+    image: np.ndarray,
+    target: dict,
+    masks_path: Union[str, pathlib.Path],
+    return_masks: bool = True,
+    input_data_format: Union[ChannelDimension, str] = None,
+) -> dict:
+    """
+    Prepare a coco panoptic annotation for DETA.
+    """
+    image_height, image_width = get_image_size(image, channel_dim=input_data_format)
+    annotation_path = pathlib.Path(masks_path) / target["file_name"]
+
+    new_target = {}
+    new_target["image_id"] = np.asarray([target["image_id"] if "image_id" in target else target["id"]], dtype=np.int64)
+    new_target["size"] = np.asarray([image_height, image_width], dtype=np.int64)
+    new_target["orig_size"] = np.asarray([image_height, image_width], dtype=np.int64)
+
+    if "segments_info" in target:
+        masks = np.asarray(PIL.Image.open(annotation_path), dtype=np.uint32)
+        masks = rgb_to_id(masks)
+
+        ids = np.array([segment_info["id"] for segment_info in target["segments_info"]])
+        masks = masks == ids[:, None, None]
+        masks = masks.astype(np.uint8)
+        if return_masks:
+            new_target["masks"] = masks
+        new_target["boxes"] = masks_to_boxes(masks)
+        new_target["class_labels"] = np.array(
+            [segment_info["category_id"] for segment_info in target["segments_info"]], dtype=np.int64
+        )
+        new_target["iscrowd"] = np.asarray(
+            [segment_info["iscrowd"] for segment_info in target["segments_info"]], dtype=np.int64
+        )
+        new_target["area"] = np.asarray(
+            [segment_info["area"] for segment_info in target["segments_info"]], dtype=np.float32
+        )
+
+    return new_target
+
+
+def resize_annotation(
+    annotation: dict[str, Any],
+    orig_size: tuple[int, int],
+    target_size: tuple[int, int],
+    threshold: float = 0.5,
+    resample: PILImageResampling = PILImageResampling.NEAREST,
+):
+    """
+    Resizes an annotation to a target size.
+
+    Args:
+        annotation (`dict[str, Any]`):
+            The annotation dictionary.
+        orig_size (`tuple[int, int]`):
+            The original size of the input image.
+        target_size (`tuple[int, int]`):
+            The target size of the image, as returned by the preprocessing `resize` step.
+        threshold (`float`, *optional*, defaults to 0.5):
+            The threshold used to binarize the segmentation masks.
+        resample (`PILImageResampling`, defaults to `PILImageResampling.NEAREST`):
+            The resampling filter to use when resizing the masks.
+    """
+    ratios = tuple(float(s) / float(s_orig) for s, s_orig in zip(target_size, orig_size))
+    ratio_height, ratio_width = ratios
+
+    new_annotation = {}
+    new_annotation["size"] = target_size
+
+    for key, value in annotation.items():
+        if key == "boxes":
+            boxes = value
+            scaled_boxes = boxes * np.asarray([ratio_width, ratio_height, ratio_width, ratio_height], dtype=np.float32)
+            new_annotation["boxes"] = scaled_boxes
+        elif key == "area":
+            area = value
+            scaled_area = area * (ratio_width * ratio_height)
+            new_annotation["area"] = scaled_area
+        elif key == "masks":
+            masks = value[:, None]
+            masks = np.array([resize(mask, target_size, resample=resample) for mask in masks])
+            masks = masks.astype(np.float32)
+            masks = masks[:, 0] > threshold
+            new_annotation["masks"] = masks
+        elif key == "size":
+            new_annotation["size"] = target_size
+        else:
+            new_annotation[key] = value
+
+    return new_annotation
+
+
+class DetaImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a Deformable DETR image processor.
+
+    Args:
+        format (`str`, *optional*, defaults to `"coco_detection"`):
+            Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Controls whether to resize the image's (height, width) dimensions to the specified `size`. Can be
+            overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 800, "longest_edge": 1333}`):
+            Size of the image's `(height, width)` dimensions after resizing. Can be overridden by the `size` parameter
+            in the `preprocess` method. Available options are:
+                - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                    Do NOT keep the aspect ratio.
+                - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                    the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                    less or equal to `longest_edge`.
+                - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                    aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                    `max_width`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            Resampling filter to use if resizing the image.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Controls whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the
+            `do_rescale` parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize:
+            Controls whether to normalize the image. Can be overridden by the `do_normalize` parameter in the
+            `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_MEAN`):
+            Mean values to use when normalizing the image. Can be a single value or a list of values, one for each
+            channel. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_DEFAULT_STD`):
+            Standard deviation values to use when normalizing the image. Can be a single value or a list of values, one
+            for each channel. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_annotations (`bool`, *optional*, defaults to `True`):
+            Controls whether to convert the annotations to the format expected by the DETR model. Converts the
+            bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+            Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+        do_pad (`bool`, *optional*, defaults to `True`):
+            Controls whether to pad the image. Can be overridden by the `do_pad` parameter in the `preprocess`
+            method. If `True`, padding will be applied to the bottom and right of the image with zeros.
+            If `pad_size` is provided, the image will be padded to the specified dimensions.
+            Otherwise, the image will be padded to the maximum height and width of the batch.
+        pad_size (`dict[str, int]`, *optional*):
+            The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+            provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+            height and width in the batch.
+    """
+
+    model_input_names = ["pixel_values", "pixel_mask"]
+
+    def __init__(
+        self,
+        format: Union[str, AnnotationFormat] = AnnotationFormat.COCO_DETECTION,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_annotations: bool = True,
+        do_pad: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> None:
+        if "pad_and_return_pixel_mask" in kwargs:
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        size = size if size is not None else {"shortest_edge": 800, "longest_edge": 1333}
+        size = get_size_dict(size, default_to_square=False)
+
+        if do_convert_annotations is None:
+            do_convert_annotations = do_normalize
+
+        super().__init__(**kwargs)
+        self.format = format
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.do_convert_annotations = do_convert_annotations
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+        self.do_pad = do_pad
+        self.pad_size = pad_size
+
+    def prepare_annotation(
+        self,
+        image: np.ndarray,
+        target: dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> dict:
+        """
+        Prepare an annotation for feeding into DETA model.
+        """
+        format = format if format is not None else self.format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
+        elif format == AnnotationFormat.COCO_PANOPTIC:
+            return_segmentation_masks = True if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_panoptic_annotation(
+                image,
+                target,
+                masks_path=masks_path,
+                return_masks=return_segmentation_masks,
+                input_data_format=input_data_format,
+            )
+        else:
+            raise ValueError(f"Format {format} is not supported.")
+        return target
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+        int, smaller edge of the image will be matched to this number.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use if resizing the image.
+            data_format (`ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred from the input
+                image.
+        """
+        size = get_size_dict(size, default_to_square=False)
+        if "shortest_edge" in size and "longest_edge" in size:
+            new_size = get_resize_output_image_size(
+                image, size["shortest_edge"], size["longest_edge"], input_data_format=input_data_format
+            )
+        elif "height" in size and "width" in size:
+            new_size = (size["height"], size["width"])
+        elif "max_height" in size and "max_width" in size:
+            new_size = get_image_size_for_max_height_width(
+                image, size["max_height"], size["max_width"], input_data_format=input_data_format
+            )
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+            )
+        image = resize(
+            image, size=new_size, resample=resample, data_format=data_format, input_data_format=input_data_format
+        )
+        return image
+
+    def resize_annotation(
+        self,
+        annotation,
+        orig_size,
+        size,
+        resample: PILImageResampling = PILImageResampling.NEAREST,
+    ) -> dict:
+        """
+        Resize the annotation to match the resized image. If size is an int, smaller edge of the mask will be matched
+        to this number.
+        """
+        return resize_annotation(annotation, orig_size=orig_size, target_size=size, resample=resample)
+
+    def rescale(
+        self,
+        image: np.ndarray,
+        rescale_factor: float,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Rescale the image by the given factor. image = image * rescale_factor.
+
+        Args:
+            image (`np.ndarray`):
+                Image to rescale.
+            rescale_factor (`float`):
+                The value to use for rescaling.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. If unset, is inferred from the input image. Can be
+                one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+        """
+        return rescale(image, rescale_factor, data_format=data_format, input_data_format=input_data_format)
+
+    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+        """
+        Normalize the boxes in the annotation from `[top_left_x, top_left_y, bottom_right_x, bottom_right_y]` to
+        `[center_x, center_y, width, height]` format and from absolute to relative pixel values.
+        """
+        return normalize_annotation(annotation, image_size=image_size)
+
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: dict,
+        input_image_size: tuple[int, int],
+        output_image_size: tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> dict:
+        """
+        Update the annotation for a padded image.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = pad(
+                    masks,
+                    padding,
+                    mode=PaddingMode.CONSTANT,
+                    constant_values=0,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                boxes = value
+                boxes *= np.asarray(
+                    [
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                        input_image_size[1] / output_image_size[1],
+                        input_image_size[0] / output_image_size[0],
+                    ]
+                )
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
+    def _pad_image(
+        self,
+        image: np.ndarray,
+        output_size: tuple[int, int],
+        annotation: Optional[dict[str, Any]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+    ) -> np.ndarray:
+        """
+        Pad an image with zeros to the given size.
+        """
+        input_height, input_width = get_image_size(image, channel_dim=input_data_format)
+        output_height, output_width = output_size
+
+        pad_bottom = output_height - input_height
+        pad_right = output_width - input_width
+        padding = ((0, pad_bottom), (0, pad_right))
+        padded_image = pad(
+            image,
+            padding,
+            mode=PaddingMode.CONSTANT,
+            constant_values=constant_values,
+            data_format=data_format,
+            input_data_format=input_data_format,
+        )
+        if annotation is not None:
+            annotation = self._update_annotation_for_padded_image(
+                annotation, (input_height, input_width), (output_height, output_width), padding, update_bboxes
+            )
+        return padded_image, annotation
+
+    def pad(
+        self,
+        images: list[np.ndarray],
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        constant_values: Union[float, Iterable[float]] = 0,
+        return_pixel_mask: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        update_bboxes: bool = True,
+        pad_size: Optional[dict[str, int]] = None,
+    ) -> BatchFeature:
+        """
+        Pads a batch of images to the bottom and right of the image with zeros to the size of largest height and width
+        in the batch and optionally returns their corresponding pixel mask.
+
+        Args:
+            images (list[`np.ndarray`]):
+                Images to pad.
+            annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+                Annotations to transform according to the padding that is applied to the images.
+            constant_values (`float` or `Iterable[float]`, *optional*):
+                The value to use for the padding if `mode` is `"constant"`.
+            return_pixel_mask (`bool`, *optional*, defaults to `True`):
+                Whether to return a pixel mask.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+            update_bboxes (`bool`, *optional*, defaults to `True`):
+                Whether to update the bounding boxes in the annotations to match the padded images. If the
+                bounding boxes have not been converted to relative coordinates and `(centre_x, centre_y, width, height)`
+                format, the bounding boxes will not be updated.
+            pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
+        """
+        pad_size = pad_size if pad_size is not None else self.pad_size
+        if pad_size is not None:
+            padded_size = (pad_size["height"], pad_size["width"])
+        else:
+            padded_size = get_max_height_width(images, input_data_format=input_data_format)
+
+        annotation_list = annotations if annotations is not None else [None] * len(images)
+        padded_images = []
+        padded_annotations = []
+        for image, annotation in zip(images, annotation_list):
+            padded_image, padded_annotation = self._pad_image(
+                image,
+                padded_size,
+                annotation,
+                constant_values=constant_values,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                update_bboxes=update_bboxes,
+            )
+            padded_images.append(padded_image)
+            padded_annotations.append(padded_annotation)
+
+        data = {"pixel_values": padded_images}
+
+        if return_pixel_mask:
+            masks = [
+                make_pixel_mask(image=image, output_size=padded_size, input_data_format=input_data_format)
+                for image in images
+            ]
+            data["pixel_mask"] = masks
+
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in padded_annotations
+            ]
+
+        return encoded_inputs
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[list[dict], list[list[dict]]]] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample=None,  # PILImageResampling
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[Union[int, float]] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_annotations: Optional[bool] = None,
+        do_pad: Optional[bool] = None,
+        format: Optional[Union[str, AnnotationFormat]] = None,
+        return_tensors: Optional[Union[TensorType, str]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        pad_size: Optional[dict[str, int]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images so that it can be used by the model.
+
+        Args:
+            images (`ImageInput`):
+                Image or batch of images to preprocess. Expects a single or batch of images with pixel values ranging
+                from 0 to 255. If passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            annotations (`list[Dict]` or `list[list[Dict]]`, *optional*):
+                List of annotations associated with the image or batch of images. If annotation is for object
+                detection, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+                  dictionary. An image can have no annotations, in which case the list should be empty.
+                If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+                - "image_id" (`int`): The image id.
+                - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                  An image can have no segments, in which case the list should be empty.
+                - "file_name" (`str`): The file name of the image.
+            return_segmentation_masks (`bool`, *optional*, defaults to self.return_segmentation_masks):
+                Whether to return segmentation masks.
+            masks_path (`str` or `pathlib.Path`, *optional*):
+                Path to the directory containing the segmentation masks.
+            do_resize (`bool`, *optional*, defaults to self.do_resize):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to self.size):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            resample (`PILImageResampling`, *optional*, defaults to self.resample):
+                Resampling filter to use when resizing the image.
+            do_rescale (`bool`, *optional*, defaults to self.do_rescale):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to self.rescale_factor):
+                Rescale factor to use when rescaling the image.
+            do_normalize (`bool`, *optional*, defaults to self.do_normalize):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to self.image_mean):
+                Mean to use when normalizing the image.
+            image_std (`float` or `list[float]`, *optional*, defaults to self.image_std):
+                Standard deviation to use when normalizing the image.
+            do_convert_annotations (`bool`, *optional*, defaults to self.do_convert_annotations):
+                Whether to convert the annotations to the format expected by the model. Converts the bounding
+                boxes from the format `(top_left_x, top_left_y, width, height)` to `(center_x, center_y, width, height)`
+                and in relative coordinates.
+            do_pad (`bool`, *optional*, defaults to self.do_pad):
+                Whether to pad the image. If `True`, padding will be applied to the bottom and right of
+                the image with zeros. If `pad_size` is provided, the image will be padded to the specified
+                dimensions. Otherwise, the image will be padded to the maximum height and width of the batch.
+            format (`str` or `AnnotationFormat`, *optional*, defaults to self.format):
+                Format of the annotations.
+            return_tensors (`str` or `TensorType`, *optional*, defaults to self.return_tensors):
+                Type of tensors to return. If `None`, will return the list of images.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            pad_size (`dict[str, int]`, *optional*):
+                The size `{"height": int, "width" int}` to pad the images to. Must be larger than any image size
+                provided for preprocessing. If `pad_size` is not provided, images will be padded to the largest
+                height and width in the batch.
+        """
+        if "pad_and_return_pixel_mask" in kwargs:
+            logger.warning_once(
+                "The `pad_and_return_pixel_mask` argument is deprecated and will be removed in a future version, "
+                "use `do_pad` instead.",
+            )
+            do_pad = kwargs.pop("pad_and_return_pixel_mask")
+
+        do_resize = self.do_resize if do_resize is None else do_resize
+        size = self.size if size is None else size
+        size = get_size_dict(size=size, default_to_square=False)
+        resample = self.resample if resample is None else resample
+        do_rescale = self.do_rescale if do_rescale is None else do_rescale
+        rescale_factor = self.rescale_factor if rescale_factor is None else rescale_factor
+        do_normalize = self.do_normalize if do_normalize is None else do_normalize
+        image_mean = self.image_mean if image_mean is None else image_mean
+        image_std = self.image_std if image_std is None else image_std
+        do_convert_annotations = (
+            self.do_convert_annotations if do_convert_annotations is None else do_convert_annotations
+        )
+        do_pad = self.do_pad if do_pad is None else do_pad
+        pad_size = self.pad_size if pad_size is None else pad_size
+        format = self.format if format is None else format
+
+        # Here, the pad() method pads to the maximum of (width, height). It does not need to be validated.
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if not is_batched(images):
+            images = [images]
+            annotations = [annotations] if annotations is not None else None
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        if (
+            masks_path is not None
+            and format == AnnotationFormat.COCO_PANOPTIC
+            and not isinstance(masks_path, (pathlib.Path, str))
+        ):
+            raise ValueError(
+                "The path to the directory containing the mask PNG files should be provided as a"
+                f" `pathlib.Path` or string object, but is {type(masks_path)} instead."
+            )
+
+        # All transformations expect numpy arrays
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+        if annotations is not None:
+            prepared_images = []
+            prepared_annotations = []
+            for image, target in zip(images, annotations):
+                target = self.prepare_annotation(
+                    image,
+                    target,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=input_data_format,
+                )
+                prepared_images.append(image)
+                prepared_annotations.append(target)
+            images = prepared_images
+            annotations = prepared_annotations
+            del prepared_images, prepared_annotations
+
+        # transformations
+        if do_resize:
+            if annotations is not None:
+                resized_images, resized_annotations = [], []
+                for image, target in zip(images, annotations):
+                    orig_size = get_image_size(image, input_data_format)
+                    resized_image = self.resize(
+                        image, size=size, resample=resample, input_data_format=input_data_format
+                    )
+                    resized_annotation = self.resize_annotation(
+                        target, orig_size, get_image_size(resized_image, input_data_format)
+                    )
+                    resized_images.append(resized_image)
+                    resized_annotations.append(resized_annotation)
+                images = resized_images
+                annotations = resized_annotations
+                del resized_images, resized_annotations
+            else:
+                images = [
+                    self.resize(image, size=size, resample=resample, input_data_format=input_data_format)
+                    for image in images
+                ]
+
+        if do_rescale:
+            images = [self.rescale(image, rescale_factor, input_data_format=input_data_format) for image in images]
+
+        if do_normalize:
+            images = [
+                self.normalize(image, image_mean, image_std, input_data_format=input_data_format) for image in images
+            ]
+
+        if do_convert_annotations and annotations is not None:
+            annotations = [
+                self.normalize_annotation(annotation, get_image_size(image, input_data_format))
+                for annotation, image in zip(annotations, images)
+            ]
+
+        if do_pad:
+            # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+            encoded_inputs = self.pad(
+                images,
+                annotations=annotations,
+                return_pixel_mask=True,
+                data_format=data_format,
+                input_data_format=input_data_format,
+                return_tensors=return_tensors,
+                update_bboxes=do_convert_annotations,
+                pad_size=pad_size,
+            )
+        else:
+            images = [
+                to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+                for image in images
+            ]
+            encoded_inputs = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+            if annotations is not None:
+                encoded_inputs["labels"] = [
+                    BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+                ]
+
+        return encoded_inputs
+
+    def post_process_object_detection(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        target_sizes: Union[TensorType, list[tuple]] = None,
+        nms_threshold: float = 0.7,
+    ):
+        """
+        Converts the output of [`DetaForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                (height, width) of each image in the batch. If left to None, predictions will not be resized.
+            nms_threshold (`float`, *optional*, defaults to 0.7):
+                NMS threshold.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+        batch_size, num_queries, num_labels = out_logits.shape
+
+        if target_sizes is not None:
+            if len(out_logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+        prob = out_logits.sigmoid()
+
+        all_scores = prob.view(batch_size, num_queries * num_labels).to(out_logits.device)
+        all_indexes = torch.arange(num_queries * num_labels)[None].repeat(batch_size, 1).to(out_logits.device)
+        all_boxes = torch.div(all_indexes, out_logits.shape[2], rounding_mode="floor")
+        all_labels = all_indexes % out_logits.shape[2]
+
+        boxes = center_to_corners_format(out_bbox)
+        boxes = torch.gather(boxes, 1, all_boxes.unsqueeze(-1).repeat(1, 1, 4))
+
+        # and from relative [0, 1] to absolute [0, height] coordinates
+        if target_sizes is not None:
+            if isinstance(target_sizes, list):
+                img_h = torch.Tensor([i[0] for i in target_sizes])
+                img_w = torch.Tensor([i[1] for i in target_sizes])
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+            boxes = boxes * scale_fct[:, None, :]
+
+        results = []
+        for b in range(batch_size):
+            box = boxes[b]
+            score = all_scores[b]
+            lbls = all_labels[b]
+
+            pre_topk = score.topk(min(10000, num_queries * num_labels)).indices
+            box = box[pre_topk]
+            score = score[pre_topk]
+            lbls = lbls[pre_topk]
+
+            # apply NMS
+            keep_inds = batched_nms(box, score, lbls, nms_threshold)[:100]
+            score = score[keep_inds]
+            lbls = lbls[keep_inds]
+            box = box[keep_inds]
+
+            results.append(
+                {
+                    "scores": score[score > threshold],
+                    "labels": lbls[score > threshold],
+                    "boxes": box[score > threshold],
+                }
+            )
+
+        return results
+
+
+__all__ = ["DetaImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/modeling_deta.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/modeling_deta.py
new file mode 100644
index 0000000000000000000000000000000000000000..c4f6f5c65dedc4e7a4d6e5151cff61226ea4a7ed
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/deta/modeling_deta.py
@@ -0,0 +1,2816 @@
+# coding=utf-8
+# Copyright 2022 SenseTime and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DETA model."""
+
+import copy
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from pathlib import Path
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+from torch.autograd import Function
+from torch.autograd.function import once_differentiable
+
+from ....activations import ACT2FN
+from ....file_utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_scipy_available,
+    is_torch_cuda_available,
+    is_vision_available,
+    replace_return_docstrings,
+)
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import meshgrid
+from ....utils import is_accelerate_available, is_ninja_available, is_torchvision_available, logging, requires_backends
+from ....utils.backbone_utils import load_backbone
+from .configuration_deta import DetaConfig
+
+
+logger = logging.get_logger(__name__)
+
+MultiScaleDeformableAttention = None
+
+
+def load_cuda_kernels():
+    from torch.utils.cpp_extension import load
+
+    global MultiScaleDeformableAttention
+
+    root = Path(__file__).resolve().parent.parent.parent.parent / "kernels" / "deta"
+    src_files = [
+        root / filename
+        for filename in [
+            "vision.cpp",
+            os.path.join("cpu", "ms_deform_attn_cpu.cpp"),
+            os.path.join("cuda", "ms_deform_attn_cuda.cu"),
+        ]
+    ]
+
+    MultiScaleDeformableAttention = load(
+        "MultiScaleDeformableAttention",
+        src_files,
+        with_cuda=True,
+        extra_include_paths=[str(root)],
+        extra_cflags=["-DWITH_CUDA=1"],
+        extra_cuda_cflags=[
+            "-DCUDA_HAS_FP16=1",
+            "-D__CUDA_NO_HALF_OPERATORS__",
+            "-D__CUDA_NO_HALF_CONVERSIONS__",
+            "-D__CUDA_NO_HALF2_OPERATORS__",
+        ],
+    )
+
+
+class MultiScaleDeformableAttentionFunction(Function):
+    @staticmethod
+    def forward(
+        context,
+        value,
+        value_spatial_shapes,
+        value_level_start_index,
+        sampling_locations,
+        attention_weights,
+        im2col_step,
+    ):
+        context.im2col_step = im2col_step
+        output = MultiScaleDeformableAttention.ms_deform_attn_forward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            context.im2col_step,
+        )
+        context.save_for_backward(
+            value, value_spatial_shapes, value_level_start_index, sampling_locations, attention_weights
+        )
+        return output
+
+    @staticmethod
+    @once_differentiable
+    def backward(context, grad_output):
+        (
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+        ) = context.saved_tensors
+        grad_value, grad_sampling_loc, grad_attn_weight = MultiScaleDeformableAttention.ms_deform_attn_backward(
+            value,
+            value_spatial_shapes,
+            value_level_start_index,
+            sampling_locations,
+            attention_weights,
+            grad_output,
+            context.im2col_step,
+        )
+
+        return grad_value, None, None, grad_sampling_loc, grad_attn_weight, None
+
+
+if is_accelerate_available():
+    from accelerate import PartialState
+    from accelerate.utils import reduce
+
+if is_vision_available():
+    from transformers.image_transforms import center_to_corners_format
+
+if is_torchvision_available():
+    from torchvision.ops.boxes import batched_nms
+
+if is_scipy_available():
+    from scipy.optimize import linear_sum_assignment
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "DetaConfig"
+_CHECKPOINT_FOR_DOC = "jozhang97/deta-swin-large-o365"
+
+
+@dataclass
+class DetaDecoderOutput(ModelOutput):
+    """
+    Base class for outputs of the DetaDecoder. This class adds two attributes to
+    BaseModelOutputWithCrossAttentions, namely:
+    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+    - a stacked tensor of intermediate reference points.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+            used to compute the weighted average in the cross-attention heads.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class DetaModelOutput(ModelOutput):
+    """
+    Base class for outputs of the Deformable DETR encoder-decoder model.
+
+    Args:
+        init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+            plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+            average in the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+            foreground and background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+        output_proposals (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+            Logits of proposal bounding boxes coordinates in the gen_encoder_output_proposals.
+    """
+
+    init_reference_points: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    output_proposals: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class DetaObjectDetectionOutput(ModelOutput):
+    """
+    Output type of [`DetaForObjectDetection`].
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+            Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+            bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+            scale-invariant IoU loss.
+        loss_dict (`Dict`, *optional*):
+            A dictionary containing the individual losses. Useful for logging.
+        logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+            Classification logits (including no-object) for all queries.
+        pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+            Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+            values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+            possible padding). You can use [`~DetaProcessor.post_process_object_detection`] to retrieve the
+            unnormalized bounding boxes.
+        auxiliary_outputs (`list[Dict]`, *optional*):
+            Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+            and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+            `pred_boxes`) for each decoder layer.
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the decoder of the model.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, num_queries, hidden_size)`. Hidden-states of the decoder at the output of each layer
+            plus the initial embedding outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, num_queries,
+            num_queries)`. Attentions weights of the decoder, after the attention softmax, used to compute the weighted
+            average in the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_queries, num_heads, 4, 4)`.
+            Attentions weights of the decoder's cross-attention layer, after the attention softmax, used to compute the
+            weighted average in the cross-attention heads.
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the encoder at the output of each
+            layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_heads, 4,
+            4)`. Attentions weights of the encoder, after the attention softmax, used to compute the weighted average
+            in the self-attention heads.
+        intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+            Stacked intermediate hidden states (output of each layer of the decoder).
+        intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+            Stacked intermediate reference points (reference points of each layer of the decoder).
+        init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
+            Initial reference points sent through the Transformer decoder.
+        enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+            picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+            foreground and background).
+        enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+            Logits of predicted bounding boxes coordinates in the first stage.
+        output_proposals (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.two_stage=True`):
+            Logits of proposal bounding boxes coordinates in the gen_encoder_output_proposals.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    output_proposals: Optional[torch.FloatTensor] = None
+
+
+def _get_clones(module, N):
+    return nn.ModuleList([copy.deepcopy(module) for i in range(N)])
+
+
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)
+
+
+class DetaFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it user-friendly
+        weight = self.weight.reshape(1, -1, 1, 1)
+        bias = self.bias.reshape(1, -1, 1, 1)
+        running_var = self.running_var.reshape(1, -1, 1, 1)
+        running_mean = self.running_mean.reshape(1, -1, 1, 1)
+        epsilon = 1e-5
+        scale = weight * (running_var + epsilon).rsqrt()
+        bias = bias - running_mean * scale
+        return x * scale + bias
+
+
+def replace_batch_norm(model):
+    r"""
+    Recursively replace all `torch.nn.BatchNorm2d` with `DetaFrozenBatchNorm2d`.
+
+    Args:
+        model (torch.nn.Module):
+            input model
+    """
+    for name, module in model.named_children():
+        if isinstance(module, nn.BatchNorm2d):
+            new_module = DetaFrozenBatchNorm2d(module.num_features)
+
+            if module.weight.device != torch.device("meta"):
+                new_module.weight.data.copy_(module.weight)
+                new_module.bias.data.copy_(module.bias)
+                new_module.running_mean.data.copy_(module.running_mean)
+                new_module.running_var.data.copy_(module.running_var)
+
+            model._modules[name] = new_module
+
+        if len(list(module.children())) > 0:
+            replace_batch_norm(module)
+
+
+class DetaBackboneWithPositionalEncodings(nn.Module):
+    """
+    Backbone model with positional embeddings.
+
+    nn.BatchNorm2d layers are replaced by DetaFrozenBatchNorm2d as defined above.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        backbone = load_backbone(config)
+        with torch.no_grad():
+            replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = self.model.channels
+
+        # TODO fix this
+        if config.backbone_config.model_type == "resnet":
+            for name, parameter in self.model.named_parameters():
+                if "stages.1" not in name and "stages.2" not in name and "stages.3" not in name:
+                    parameter.requires_grad_(False)
+
+        self.position_embedding = build_position_encoding(config)
+
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+        """
+        Outputs feature maps of latter stages C_3 through C_5 in ResNet if `config.num_feature_levels > 1`, otherwise
+        outputs feature maps of C_5.
+        """
+        # first, send pixel_values through the backbone to get list of feature maps
+        features = self.model(pixel_values).feature_maps
+
+        # next, create position embeddings
+        out = []
+        pos = []
+        for feature_map in features:
+            # downsample pixel_mask to match shape of corresponding feature_map
+            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            position_embeddings = self.position_embedding(feature_map, mask).to(feature_map.dtype)
+            out.append((feature_map, mask))
+            pos.append(position_embeddings)
+
+        return out, pos
+
+
+class DetaSinePositionEmbedding(nn.Module):
+    """
+    This is a more standard version of the position embedding, very similar to the one used by the Attention is all you
+    need paper, generalized to work on images.
+    """
+
+    def __init__(self, embedding_dim=64, temperature=10000, normalize=False, scale=None):
+        super().__init__()
+        self.embedding_dim = embedding_dim
+        self.temperature = temperature
+        self.normalize = normalize
+        if scale is not None and normalize is False:
+            raise ValueError("normalize should be True if scale is passed")
+        if scale is None:
+            scale = 2 * math.pi
+        self.scale = scale
+
+    def forward(self, pixel_values, pixel_mask):
+        if pixel_mask is None:
+            raise ValueError("No pixel mask provided")
+        y_embed = pixel_mask.cumsum(1, dtype=torch.float32)
+        x_embed = pixel_mask.cumsum(2, dtype=torch.float32)
+        if self.normalize:
+            eps = 1e-6
+            y_embed = (y_embed - 0.5) / (y_embed[:, -1:, :] + eps) * self.scale
+            x_embed = (x_embed - 0.5) / (x_embed[:, :, -1:] + eps) * self.scale
+
+        dim_t = torch.arange(self.embedding_dim, dtype=torch.int64, device=pixel_values.device).float()
+        dim_t = self.temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / self.embedding_dim)
+
+        pos_x = x_embed[:, :, :, None] / dim_t
+        pos_y = y_embed[:, :, :, None] / dim_t
+        pos_x = torch.stack((pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos_y = torch.stack((pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4).flatten(3)
+        pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2)
+        return pos
+
+
+class DetaLearnedPositionEmbedding(nn.Module):
+    """
+    This module learns positional embeddings up to a fixed maximum size.
+    """
+
+    def __init__(self, embedding_dim=256):
+        super().__init__()
+        self.row_embeddings = nn.Embedding(50, embedding_dim)
+        self.column_embeddings = nn.Embedding(50, embedding_dim)
+
+    def forward(self, pixel_values, pixel_mask=None):
+        height, width = pixel_values.shape[-2:]
+        width_values = torch.arange(width, device=pixel_values.device)
+        height_values = torch.arange(height, device=pixel_values.device)
+        x_emb = self.column_embeddings(width_values)
+        y_emb = self.row_embeddings(height_values)
+        pos = torch.cat([x_emb.unsqueeze(0).repeat(height, 1, 1), y_emb.unsqueeze(1).repeat(1, width, 1)], dim=-1)
+        pos = pos.permute(2, 0, 1)
+        pos = pos.unsqueeze(0)
+        pos = pos.repeat(pixel_values.shape[0], 1, 1, 1)
+        return pos
+
+
+def build_position_encoding(config):
+    n_steps = config.d_model // 2
+    if config.position_embedding_type == "sine":
+        # TODO find a better way of exposing other arguments
+        position_embedding = DetaSinePositionEmbedding(n_steps, normalize=True)
+    elif config.position_embedding_type == "learned":
+        position_embedding = DetaLearnedPositionEmbedding(n_steps)
+    else:
+        raise ValueError(f"Not supported {config.position_embedding_type}")
+
+    return position_embedding
+
+
+def multi_scale_deformable_attention(
+    value: Tensor, value_spatial_shapes: Tensor, sampling_locations: Tensor, attention_weights: Tensor
+) -> Tensor:
+    batch_size, _, num_heads, hidden_dim = value.shape
+    _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+    value_list = value.split([height.item() * width.item() for height, width in value_spatial_shapes], dim=1)
+    sampling_grids = 2 * sampling_locations - 1
+    sampling_value_list = []
+    for level_id, (height, width) in enumerate(value_spatial_shapes):
+        # batch_size, height*width, num_heads, hidden_dim
+        # -> batch_size, height*width, num_heads*hidden_dim
+        # -> batch_size, num_heads*hidden_dim, height*width
+        # -> batch_size*num_heads, hidden_dim, height, width
+        value_l_ = (
+            value_list[level_id].flatten(2).transpose(1, 2).reshape(batch_size * num_heads, hidden_dim, height, width)
+        )
+        # batch_size, num_queries, num_heads, num_points, 2
+        # -> batch_size, num_heads, num_queries, num_points, 2
+        # -> batch_size*num_heads, num_queries, num_points, 2
+        sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+        # batch_size*num_heads, hidden_dim, num_queries, num_points
+        sampling_value_l_ = nn.functional.grid_sample(
+            value_l_, sampling_grid_l_, mode="bilinear", padding_mode="zeros", align_corners=False
+        )
+        sampling_value_list.append(sampling_value_l_)
+    # (batch_size, num_queries, num_heads, num_levels, num_points)
+    # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+    # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
+    attention_weights = attention_weights.transpose(1, 2).reshape(
+        batch_size * num_heads, 1, num_queries, num_levels * num_points
+    )
+    output = (
+        (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+        .sum(-1)
+        .view(batch_size, num_heads * hidden_dim, num_queries)
+    )
+    return output.transpose(1, 2).contiguous()
+
+
+class DetaMultiscaleDeformableAttention(nn.Module):
+    """
+    Multiscale deformable attention as proposed in Deformable DETR.
+    """
+
+    def __init__(self, config: DetaConfig, num_heads: int, n_points: int):
+        super().__init__()
+
+        kernel_loaded = MultiScaleDeformableAttention is not None
+        if is_torch_cuda_available() and is_ninja_available() and not kernel_loaded:
+            try:
+                load_cuda_kernels()
+            except Exception as e:
+                logger.warning(f"Could not load the custom kernel for multi-scale deformable attention: {e}")
+
+        if config.d_model % num_heads != 0:
+            raise ValueError(
+                f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+            )
+        dim_per_head = config.d_model // num_heads
+        # check if dim_per_head is power of 2
+        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+            warnings.warn(
+                "You'd better set embed_dim (d_model) in DetaMultiscaleDeformableAttention to make the"
+                " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
+                " implementation."
+            )
+
+        self.im2col_step = 64
+
+        self.d_model = config.d_model
+        self.n_levels = config.num_feature_levels
+        self.n_heads = num_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
+        self.value_proj = nn.Linear(config.d_model, config.d_model)
+        self.output_proj = nn.Linear(config.d_model, config.d_model)
+
+        self.disable_custom_kernels = config.disable_custom_kernels
+
+        self._reset_parameters()
+
+    def _reset_parameters(self):
+        nn.init.constant_(self.sampling_offsets.weight.data, 0.0)
+        default_dtype = torch.get_default_dtype()
+        thetas = torch.arange(self.n_heads, dtype=torch.int64).to(default_dtype) * (2.0 * math.pi / self.n_heads)
+        grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+        grid_init = (
+            (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+            .view(self.n_heads, 1, 1, 2)
+            .repeat(1, self.n_levels, self.n_points, 1)
+        )
+        for i in range(self.n_points):
+            grid_init[:, :, i, :] *= i + 1
+        with torch.no_grad():
+            self.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+        nn.init.constant_(self.attention_weights.weight.data, 0.0)
+        nn.init.constant_(self.attention_weights.bias.data, 0.0)
+        nn.init.xavier_uniform_(self.value_proj.weight.data)
+        nn.init.constant_(self.value_proj.bias.data, 0.0)
+        nn.init.xavier_uniform_(self.output_proj.weight.data)
+        nn.init.constant_(self.output_proj.bias.data, 0.0)
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        level_start_index=None,
+        output_attentions: bool = False,
+    ):
+        # add position embeddings to the hidden states before projecting to queries and keys
+        if position_embeddings is not None:
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        batch_size, num_queries, _ = hidden_states.shape
+        batch_size, sequence_length, _ = encoder_hidden_states.shape
+        if (spatial_shapes[:, 0] * spatial_shapes[:, 1]).sum() != sequence_length:
+            raise ValueError(
+                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+            )
+
+        value = self.value_proj(encoder_hidden_states)
+        if attention_mask is not None:
+            # we invert the attention_mask
+            value = value.masked_fill(~attention_mask[..., None], float(0))
+        value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
+        )
+        attention_weights = self.attention_weights(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
+        )
+        attention_weights = F.softmax(attention_weights, -1).view(
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
+        )
+        # batch_size, num_queries, n_heads, n_levels, n_points, 2
+        num_coordinates = reference_points.shape[-1]
+        if num_coordinates == 2:
+            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
+                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
+        elif num_coordinates == 4:
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+            )
+        else:
+            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+
+        if self.disable_custom_kernels:
+            # PyTorch implementation
+            output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+        else:
+            try:
+                # custom kernel
+                output = MultiScaleDeformableAttentionFunction.apply(
+                    value,
+                    spatial_shapes,
+                    level_start_index,
+                    sampling_locations,
+                    attention_weights,
+                    self.im2col_step,
+                )
+            except Exception:
+                # PyTorch implementation
+                output = multi_scale_deformable_attention(value, spatial_shapes, sampling_locations, attention_weights)
+        output = self.output_proj(output)
+
+        return output, attention_weights
+
+
+class DetaMultiheadAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, target_len, embed_dim = hidden_states.size()
+        # add position embeddings to the hidden states before projecting to queries and keys
+        if position_embeddings is not None:
+            hidden_states_original = hidden_states
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        # get queries, keys and values
+        query_states = self.q_proj(hidden_states) * self.scaling
+        key_states = self._shape(self.k_proj(hidden_states), -1, batch_size)
+        value_states = self._shape(self.v_proj(hidden_states_original), -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, target_len, batch_size).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        source_len = key_states.size(1)
+
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [batch_size, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, hidden_states.dtype)
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, target_len, source_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+                    f" {attention_mask.size()}"
+                )
+            if attention_mask.dtype == torch.bool:
+                attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_(
+                    attention_mask, -torch.inf
+                )
+            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+            attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+class DetaEncoderLayer(nn.Module):
+    def __init__(self, config: DetaConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+        self.self_attn = DetaMultiscaleDeformableAttention(
+            config,
+            num_heads=config.encoder_attention_heads,
+            n_points=config.encoder_n_points,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        level_start_index=None,
+        output_attentions: bool = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Input to the layer.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+                Attention mask.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings, to be added to `hidden_states`.
+            reference_points (`torch.FloatTensor`, *optional*):
+                Reference points.
+            spatial_shapes (`torch.LongTensor`, *optional*):
+                Spatial shapes of the backbone feature maps.
+            level_start_index (`torch.LongTensor`, *optional*):
+                Level start index.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Apply Multi-scale Deformable Attention Module on the multi-scale feature maps.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            encoder_hidden_states=hidden_states,
+            encoder_attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        if self.training:
+            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class DetaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DetaConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        # self-attention
+        self.self_attn = DetaMultiheadAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        # cross-attention
+        self.encoder_attn = DetaMultiscaleDeformableAttention(
+            config,
+            num_heads=config.decoder_attention_heads,
+            n_points=config.decoder_n_points,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        # feedforward neural networks
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        level_start_index=None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(batch, seq_len, embed_dim)`.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings that are added to the queries and keys in the self-attention layer.
+            reference_points (`torch.FloatTensor`, *optional*):
+                Reference points.
+            spatial_shapes (`torch.LongTensor`, *optional*):
+                Spatial shapes.
+            level_start_index (`torch.LongTensor`, *optional*):
+                Level start index.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        second_residual = hidden_states
+
+        # Cross-Attention
+        cross_attn_weights = None
+        hidden_states, cross_attn_weights = self.encoder_attn(
+            hidden_states=hidden_states,
+            attention_mask=encoder_attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            position_embeddings=position_embeddings,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = second_residual + hidden_states
+
+        hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+class DetaPreTrainedModel(PreTrainedModel):
+    config: DetaConfig
+    base_model_prefix = "model"
+    main_input_name = "pixel_values"
+    _no_split_modules = [r"DetaBackboneWithPositionalEncodings", r"DetaEncoderLayer", r"DetaDecoderLayer"]
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+
+        if isinstance(module, DetaLearnedPositionEmbedding):
+            nn.init.uniform_(module.row_embeddings.weight)
+            nn.init.uniform_(module.column_embeddings.weight)
+        elif isinstance(module, DetaMultiscaleDeformableAttention):
+            module._reset_parameters()
+        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        if hasattr(module, "reference_points") and not self.config.two_stage:
+            nn.init.xavier_uniform_(module.reference_points.weight.data, gain=1.0)
+            nn.init.constant_(module.reference_points.bias.data, 0.0)
+        if hasattr(module, "level_embed"):
+            nn.init.normal_(module.level_embed)
+
+
+DETA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`DetaConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+DETA_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Padding will be ignored by default should you provide it.
+
+            Pixel values can be obtained using [`AutoImageProcessor`]. See [`AutoImageProcessor.__call__`] for details.
+
+        pixel_mask (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Mask to avoid performing attention on padding pixel values. Mask values selected in `[0, 1]`:
+
+            - 1 for pixels that are real (i.e. **not masked**),
+            - 0 for pixels that are padding (i.e. **masked**).
+
+            [What are attention masks?](../glossary#attention-mask)
+
+        decoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_queries)`, *optional*):
+            Not used by default. Can be used to mask object queries.
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class DetaEncoder(DetaPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* deformable attention layers. Each layer is a
+    [`DetaEncoderLayer`].
+
+    The encoder updates the flattened multi-scale feature maps through multiple deformable attention layers.
+
+    Args:
+        config: DetaConfig
+    """
+
+    def __init__(self, config: DetaConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layers = nn.ModuleList([DetaEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @staticmethod
+    def get_reference_points(spatial_shapes, valid_ratios, device):
+        """
+        Get reference points for each feature map. Used in decoder.
+
+        Args:
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of each feature map.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+                Valid ratios of each feature map.
+            device (`torch.device`):
+                Device on which to create the tensors.
+        Returns:
+            `torch.FloatTensor` of shape `(batch_size, num_queries, num_feature_levels, 2)`
+        """
+        reference_points_list = []
+        for level, (height, width) in enumerate(spatial_shapes):
+            ref_y, ref_x = meshgrid(
+                torch.linspace(0.5, height - 0.5, height, dtype=torch.float32, device=device),
+                torch.linspace(0.5, width - 0.5, width, dtype=torch.float32, device=device),
+                indexing="ij",
+            )
+            # TODO: valid_ratios could be useless here. check https://github.com/fundamentalvision/Deformable-DETR/issues/36
+            ref_y = ref_y.reshape(-1)[None] / (valid_ratios[:, None, level, 1] * height)
+            ref_x = ref_x.reshape(-1)[None] / (valid_ratios[:, None, level, 0] * width)
+            ref = torch.stack((ref_x, ref_y), -1)
+            reference_points_list.append(ref)
+        reference_points = torch.cat(reference_points_list, 1)
+        reference_points = reference_points[:, :, None] * valid_ratios[:, None]
+        return reference_points
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        position_embeddings=None,
+        spatial_shapes=None,
+        level_start_index=None,
+        valid_ratios=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+                - 1 for pixel features that are real (i.e. **not masked**),
+                - 0 for pixel features that are padding (i.e. **masked**).
+                [What are attention masks?](../glossary#attention-mask)
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of each feature map.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
+                Starting index of each feature map.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+                Ratio of valid area in each feature level.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        reference_points = self.get_reference_points(spatial_shapes, valid_ratios, device=inputs_embeds.device)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        for i, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                position_embeddings=position_embeddings,
+                reference_points=reference_points,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class DetaDecoder(DetaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`DetaDecoderLayer`].
+
+    The decoder updates the query embeddings through multiple self-attention and cross-attention layers.
+
+    Some tweaks for Deformable DETR:
+
+    - `position_embeddings`, `reference_points`, `spatial_shapes` and `valid_ratios` are added to the forward pass.
+    - it also returns a stack of intermediate outputs and reference points from all decoding layers.
+
+    Args:
+        config: DetaConfig
+    """
+
+    def __init__(self, config: DetaConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layers = nn.ModuleList([DetaDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.gradient_checkpointing = False
+
+        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+        self.bbox_embed = None
+        self.class_embed = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        position_embeddings=None,
+        reference_points=None,
+        spatial_shapes=None,
+        level_start_index=None,
+        valid_ratios=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
+                Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indexes for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        intermediate = ()
+        intermediate_reference_points = ()
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if reference_points.shape[-1] == 4:
+                reference_points_input = (
+                    reference_points[:, :, None] * torch.cat([valid_ratios, valid_ratios], -1)[:, None]
+                )
+            else:
+                if reference_points.shape[-1] != 2:
+                    raise ValueError("Reference points' last dimension must be of size 2")
+                reference_points_input = reference_points[:, :, None] * valid_ratios[:, None]
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                encoder_hidden_states=encoder_hidden_states,
+                reference_points=reference_points_input,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                tmp = self.bbox_embed[idx](hidden_states)
+                if reference_points.shape[-1] == 4:
+                    new_reference_points = tmp + inverse_sigmoid(reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                else:
+                    if reference_points.shape[-1] != 2:
+                        raise ValueError(
+                            f"Reference points' last dimension must be of size 2, but is {reference_points.shape[-1]}"
+                        )
+                    new_reference_points = tmp
+                    new_reference_points[..., :2] = tmp[..., :2] + inverse_sigmoid(reference_points)
+                    new_reference_points = new_reference_points.sigmoid()
+                reference_points = new_reference_points.detach()
+
+            intermediate += (hidden_states,)
+            intermediate_reference_points += (reference_points,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # Keep batch_size as first dimension
+        intermediate = torch.stack(intermediate, dim=1)
+        intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    intermediate,
+                    intermediate_reference_points,
+                    all_hidden_states,
+                    all_self_attns,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return DetaDecoderOutput(
+            last_hidden_state=hidden_states,
+            intermediate_hidden_states=intermediate,
+            intermediate_reference_points=intermediate_reference_points,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    The bare DETA Model (consisting of a backbone and encoder-decoder Transformer) outputting raw hidden-states without
+    any specific head on top.
+    """,
+    DETA_START_DOCSTRING,
+)
+class DetaModel(DetaPreTrainedModel):
+    def __init__(self, config: DetaConfig):
+        super().__init__(config)
+
+        if config.two_stage:
+            requires_backends(self, ["torchvision"])
+
+        # Create backbone with positional encoding
+        self.backbone = DetaBackboneWithPositionalEncodings(config)
+        intermediate_channel_sizes = self.backbone.intermediate_channel_sizes
+
+        # Create input projection layers
+        if config.num_feature_levels > 1:
+            num_backbone_outs = len(intermediate_channel_sizes)
+            input_proj_list = []
+            for _ in range(num_backbone_outs):
+                in_channels = intermediate_channel_sizes[_]
+                input_proj_list.append(
+                    nn.Sequential(
+                        nn.Conv2d(in_channels, config.d_model, kernel_size=1),
+                        nn.GroupNorm(32, config.d_model),
+                    )
+                )
+            for _ in range(config.num_feature_levels - num_backbone_outs):
+                input_proj_list.append(
+                    nn.Sequential(
+                        nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1),
+                        nn.GroupNorm(32, config.d_model),
+                    )
+                )
+                in_channels = config.d_model
+            self.input_proj = nn.ModuleList(input_proj_list)
+        else:
+            self.input_proj = nn.ModuleList(
+                [
+                    nn.Sequential(
+                        nn.Conv2d(intermediate_channel_sizes[-1], config.d_model, kernel_size=1),
+                        nn.GroupNorm(32, config.d_model),
+                    )
+                ]
+            )
+
+        if not config.two_stage:
+            self.query_position_embeddings = nn.Embedding(config.num_queries, config.d_model * 2)
+
+        self.encoder = DetaEncoder(config)
+        self.decoder = DetaDecoder(config)
+
+        self.level_embed = nn.Parameter(torch.Tensor(config.num_feature_levels, config.d_model))
+
+        if config.two_stage:
+            self.enc_output = nn.Linear(config.d_model, config.d_model)
+            self.enc_output_norm = nn.LayerNorm(config.d_model)
+            self.pos_trans = nn.Linear(config.d_model * 2, config.d_model * 2)
+            self.pos_trans_norm = nn.LayerNorm(config.d_model * 2)
+            self.pix_trans = nn.Linear(config.d_model, config.d_model)
+            self.pix_trans_norm = nn.LayerNorm(config.d_model)
+        else:
+            self.reference_points = nn.Linear(config.d_model, 2)
+
+        self.assign_first_stage = config.assign_first_stage
+        self.two_stage_num_proposals = config.two_stage_num_proposals
+
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    def freeze_backbone(self):
+        for name, param in self.backbone.model.named_parameters():
+            param.requires_grad_(False)
+
+    def unfreeze_backbone(self):
+        for name, param in self.backbone.model.named_parameters():
+            param.requires_grad_(True)
+
+    def get_valid_ratio(self, mask, dtype=torch.float32):
+        """Get the valid ratio of all feature maps."""
+
+        _, height, width = mask.shape
+        valid_height = torch.sum(mask[:, :, 0], 1)
+        valid_width = torch.sum(mask[:, 0, :], 1)
+        valid_ratio_height = valid_height.to(dtype) / height
+        valid_ratio_width = valid_width.to(dtype) / width
+        valid_ratio = torch.stack([valid_ratio_width, valid_ratio_height], -1)
+        return valid_ratio
+
+    def get_proposal_pos_embed(self, proposals):
+        """Get the position embedding of the proposals."""
+
+        num_pos_feats = self.config.d_model // 2
+        temperature = 10000
+        scale = 2 * math.pi
+
+        dim_t = torch.arange(num_pos_feats, dtype=torch.int64, device=proposals.device).float()
+        dim_t = temperature ** (2 * torch.div(dim_t, 2, rounding_mode="floor") / num_pos_feats)
+        # batch_size, num_queries, 4
+        proposals = proposals.sigmoid() * scale
+        # batch_size, num_queries, 4, 128
+        pos = proposals[:, :, :, None] / dim_t
+        # batch_size, num_queries, 4, 64, 2 -> batch_size, num_queries, 512
+        pos = torch.stack((pos[:, :, :, 0::2].sin(), pos[:, :, :, 1::2].cos()), dim=4).flatten(2)
+        return pos
+
+    def gen_encoder_output_proposals(self, enc_output, padding_mask, spatial_shapes):
+        """Generate the encoder output proposals from encoded enc_output.
+
+        Args:
+            enc_output (Tensor[batch_size, sequence_length, hidden_size]): Output of the encoder.
+            padding_mask (Tensor[batch_size, sequence_length]): Padding mask for `enc_output`.
+            spatial_shapes (Tensor[num_feature_levels, 2]): Spatial shapes of the feature maps.
+
+        Returns:
+            `tuple(torch.FloatTensor)`: A tuple of feature map and bbox prediction.
+                - object_query (Tensor[batch_size, sequence_length, hidden_size]): Object query features. Later used to
+                  directly predict a bounding box. (without the need of a decoder)
+                - output_proposals (Tensor[batch_size, sequence_length, 4]): Normalized proposals, after an inverse
+                  sigmoid.
+        """
+        batch_size = enc_output.shape[0]
+        proposals = []
+        _cur = 0
+        level_ids = []
+        for level, (height, width) in enumerate(spatial_shapes):
+            mask_flatten_ = padding_mask[:, _cur : (_cur + height * width)].view(batch_size, height, width, 1)
+            valid_height = torch.sum(~mask_flatten_[:, :, 0, 0], 1)
+            valid_width = torch.sum(~mask_flatten_[:, 0, :, 0], 1)
+
+            grid_y, grid_x = meshgrid(
+                torch.linspace(0, height - 1, height, dtype=torch.float32, device=enc_output.device),
+                torch.linspace(0, width - 1, width, dtype=torch.float32, device=enc_output.device),
+                indexing="ij",
+            )
+            grid = torch.cat([grid_x.unsqueeze(-1), grid_y.unsqueeze(-1)], -1)
+
+            scale = torch.cat([valid_width.unsqueeze(-1), valid_height.unsqueeze(-1)], 1).view(batch_size, 1, 1, 2)
+            grid = (grid.unsqueeze(0).expand(batch_size, -1, -1, -1) + 0.5) / scale
+            width_height = torch.ones_like(grid) * 0.05 * (2.0**level)
+            proposal = torch.cat((grid, width_height), -1).view(batch_size, -1, 4)
+            proposals.append(proposal)
+            _cur += height * width
+            level_ids.append(grid.new_ones(height * width, dtype=torch.long) * level)
+        output_proposals = torch.cat(proposals, 1)
+        output_proposals_valid = ((output_proposals > 0.01) & (output_proposals < 0.99)).all(-1, keepdim=True)
+        output_proposals = torch.log(output_proposals / (1 - output_proposals))  # inverse sigmoid
+        output_proposals = output_proposals.masked_fill(padding_mask.unsqueeze(-1), float("inf"))
+        output_proposals = output_proposals.masked_fill(~output_proposals_valid, float("inf"))
+
+        # assign each pixel as an object query
+        object_query = enc_output
+        object_query = object_query.masked_fill(padding_mask.unsqueeze(-1), float(0))
+        object_query = object_query.masked_fill(~output_proposals_valid, float(0))
+        object_query = self.enc_output_norm(self.enc_output(object_query))
+        level_ids = torch.cat(level_ids)
+        return object_query, output_proposals, level_ids
+
+    @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=DetaModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetaModelOutput]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DetaModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large-o365")
+        >>> model = DetaModel.from_pretrained("jozhang97/deta-swin-large-o365", two_stage=False)
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 900, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), dtype=torch.long, device=device)
+
+        # Extract multi-scale feature maps of same resolution `config.d_model` (cf Figure 4 in paper)
+        # First, sent pixel_values + pixel_mask through Backbone to obtain the features
+        # which is a list of tuples
+        features, position_embeddings_list = self.backbone(pixel_values, pixel_mask)
+
+        # Then, apply 1x1 convolution to reduce the channel dimension to d_model (256 by default)
+        sources = []
+        masks = []
+        for level, (source, mask) in enumerate(features):
+            sources.append(self.input_proj[level](source))
+            masks.append(mask)
+            if mask is None:
+                raise ValueError("No attention mask was provided")
+
+        # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+        if self.config.num_feature_levels > len(sources):
+            _len_sources = len(sources)
+            for level in range(_len_sources, self.config.num_feature_levels):
+                if level == _len_sources:
+                    source = self.input_proj[level](features[-1][0])
+                else:
+                    source = self.input_proj[level](sources[-1])
+                mask = nn.functional.interpolate(pixel_mask[None].float(), size=source.shape[-2:]).to(torch.bool)[0]
+                pos_l = self.backbone.position_embedding(source, mask).to(source.dtype)
+                sources.append(source)
+                masks.append(mask)
+                position_embeddings_list.append(pos_l)
+
+        # Create queries
+        query_embeds = None
+        if not self.config.two_stage:
+            query_embeds = self.query_position_embeddings.weight
+
+        # Prepare encoder inputs (by flattening)
+        spatial_shapes = [(source.shape[2:]) for source in sources]
+        source_flatten = [source.flatten(2).transpose(1, 2) for source in sources]
+        mask_flatten = [mask.flatten(1) for mask in masks]
+
+        lvl_pos_embed_flatten = []
+        for level, pos_embed in enumerate(position_embeddings_list):
+            pos_embed = pos_embed.flatten(2).transpose(1, 2)
+            lvl_pos_embed = pos_embed + self.level_embed[level].view(1, 1, -1)
+            lvl_pos_embed_flatten.append(lvl_pos_embed)
+
+        source_flatten = torch.cat(source_flatten, 1)
+        mask_flatten = torch.cat(mask_flatten, 1)
+        lvl_pos_embed_flatten = torch.cat(lvl_pos_embed_flatten, 1)
+        spatial_shapes = torch.as_tensor(spatial_shapes, dtype=torch.long, device=source_flatten.device)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+        valid_ratios = torch.stack([self.get_valid_ratio(m) for m in masks], 1)
+        valid_ratios = valid_ratios.float()
+
+        # Fourth, sent source_flatten + mask_flatten + lvl_pos_embed_flatten (backbone + proj layer output) through encoder
+        # Also provide spatial_shapes, level_start_index and valid_ratios
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                inputs_embeds=source_flatten,
+                attention_mask=mask_flatten,
+                position_embeddings=lvl_pos_embed_flatten,
+                spatial_shapes=spatial_shapes,
+                level_start_index=level_start_index,
+                valid_ratios=valid_ratios,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # Fifth, prepare decoder inputs
+        batch_size, _, num_channels = encoder_outputs[0].shape
+        enc_outputs_class = None
+        enc_outputs_coord_logits = None
+        output_proposals = None
+        if self.config.two_stage:
+            object_query_embedding, output_proposals, level_ids = self.gen_encoder_output_proposals(
+                encoder_outputs[0], ~mask_flatten, spatial_shapes
+            )
+
+            # hack implementation for two-stage DETA
+            # apply a detection head to each pixel (A.4 in paper)
+            # linear projection for bounding box binary classification (i.e. foreground and background)
+            enc_outputs_class = self.decoder.class_embed[-1](object_query_embedding)
+            # 3-layer FFN to predict bounding boxes coordinates (bbox regression branch)
+            delta_bbox = self.decoder.bbox_embed[-1](object_query_embedding)
+            enc_outputs_coord_logits = delta_bbox + output_proposals
+
+            # only keep top scoring `config.two_stage_num_proposals` proposals
+            topk = self.two_stage_num_proposals
+            proposal_logit = enc_outputs_class[..., 0]
+
+            if self.assign_first_stage:
+                proposal_boxes = center_to_corners_format(enc_outputs_coord_logits.sigmoid().float()).clamp(0, 1)
+                topk_proposals = []
+                for b in range(batch_size):
+                    prop_boxes_b = proposal_boxes[b]
+                    prop_logits_b = proposal_logit[b]
+
+                    # pre-nms per-level topk
+                    pre_nms_topk = 1000
+                    pre_nms_inds = []
+                    for lvl in range(len(spatial_shapes)):
+                        lvl_mask = level_ids == lvl
+                        pre_nms_inds.append(torch.topk(prop_logits_b.sigmoid() * lvl_mask, pre_nms_topk)[1])
+                    pre_nms_inds = torch.cat(pre_nms_inds)
+
+                    # nms on topk indices
+                    post_nms_inds = batched_nms(
+                        prop_boxes_b[pre_nms_inds], prop_logits_b[pre_nms_inds], level_ids[pre_nms_inds], 0.9
+                    )
+                    keep_inds = pre_nms_inds[post_nms_inds]
+
+                    if len(keep_inds) < self.two_stage_num_proposals:
+                        print(
+                            f"[WARNING] nms proposals ({len(keep_inds)}) < {self.two_stage_num_proposals}, running"
+                            " naive topk"
+                        )
+                        keep_inds = torch.topk(proposal_logit[b], topk)[1]
+
+                    # keep top Q/L indices for L levels
+                    q_per_l = topk // len(spatial_shapes)
+                    is_level_ordered = (
+                        level_ids[keep_inds][None]
+                        == torch.arange(len(spatial_shapes), device=level_ids.device)[:, None]
+                    )
+                    keep_inds_mask = is_level_ordered & (is_level_ordered.cumsum(1) <= q_per_l)  # LS
+                    keep_inds_mask = keep_inds_mask.any(0)  # S
+
+                    # pad to Q indices (might let ones filtered from pre-nms sneak by... unlikely because we pick high conf anyways)
+                    if keep_inds_mask.sum() < topk:
+                        num_to_add = topk - keep_inds_mask.sum()
+                        pad_inds = (~keep_inds_mask).nonzero()[:num_to_add]
+                        keep_inds_mask[pad_inds] = True
+
+                    keep_inds_topk = keep_inds[keep_inds_mask]
+                    topk_proposals.append(keep_inds_topk)
+                topk_proposals = torch.stack(topk_proposals)
+            else:
+                topk_proposals = torch.topk(enc_outputs_class[..., 0], topk, dim=1)[1]
+
+            topk_coords_logits = torch.gather(
+                enc_outputs_coord_logits, 1, topk_proposals.unsqueeze(-1).repeat(1, 1, 4)
+            )
+            topk_coords_logits = topk_coords_logits.detach()
+            reference_points = topk_coords_logits.sigmoid()
+            init_reference_points = reference_points
+            pos_trans_out = self.pos_trans_norm(self.pos_trans(self.get_proposal_pos_embed(topk_coords_logits)))
+            query_embed, target = torch.split(pos_trans_out, num_channels, dim=2)
+
+            topk_feats = torch.stack(
+                [object_query_embedding[b][topk_proposals[b]] for b in range(batch_size)]
+            ).detach()
+            target = target + self.pix_trans_norm(self.pix_trans(topk_feats))
+        else:
+            query_embed, target = torch.split(query_embeds, num_channels, dim=1)
+            query_embed = query_embed.unsqueeze(0).expand(batch_size, -1, -1)
+            target = target.unsqueeze(0).expand(batch_size, -1, -1)
+            reference_points = self.reference_points(query_embed).sigmoid()
+            init_reference_points = reference_points
+
+        decoder_outputs = self.decoder(
+            inputs_embeds=target,
+            position_embeddings=query_embed,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=mask_flatten,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            level_start_index=level_start_index,
+            valid_ratios=valid_ratios,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            enc_outputs = tuple(value for value in [enc_outputs_class, enc_outputs_coord_logits] if value is not None)
+            tuple_outputs = (init_reference_points,) + decoder_outputs + encoder_outputs + enc_outputs
+
+            return tuple_outputs
+
+        return DetaModelOutput(
+            init_reference_points=init_reference_points,
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord_logits=enc_outputs_coord_logits,
+            output_proposals=output_proposals,
+        )
+
+
+@add_start_docstrings(
+    """
+    DETA Model (consisting of a backbone and encoder-decoder Transformer) with object detection heads on top, for tasks
+    such as COCO detection.
+    """,
+    DETA_START_DOCSTRING,
+)
+class DetaForObjectDetection(DetaPreTrainedModel):
+    # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+    _tied_weights_keys = [r"bbox_embed\.\d+", r"class_embed\.\d+"]
+    # We can't initialize the model on meta device as some weights are modified during the initialization
+    _no_split_modules = None
+
+    def __init__(self, config: DetaConfig):
+        super().__init__(config)
+
+        # Deformable DETR encoder-decoder model
+        self.model = DetaModel(config)
+
+        # Detection heads on top
+        self.class_embed = nn.Linear(config.d_model, config.num_labels)
+        self.bbox_embed = DetaMLPPredictionHead(
+            input_dim=config.d_model, hidden_dim=config.d_model, output_dim=4, num_layers=3
+        )
+
+        prior_prob = 0.01
+        bias_value = -math.log((1 - prior_prob) / prior_prob)
+        self.class_embed.bias.data.fill_(bias_value)
+        nn.init.constant_(self.bbox_embed.layers[-1].weight.data, 0)
+        nn.init.constant_(self.bbox_embed.layers[-1].bias.data, 0)
+
+        # if two-stage, the last class_embed and bbox_embed is for region proposal generation
+        num_pred = (config.decoder_layers + 1) if config.two_stage else config.decoder_layers
+        if config.with_box_refine:
+            self.class_embed = _get_clones(self.class_embed, num_pred)
+            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
+            nn.init.constant_(self.bbox_embed[0].layers[-1].bias.data[2:], -2.0)
+            # hack implementation for iterative bounding box refinement
+            self.model.decoder.bbox_embed = self.bbox_embed
+        else:
+            nn.init.constant_(self.bbox_embed.layers[-1].bias.data[2:], -2.0)
+            self.class_embed = nn.ModuleList([self.class_embed for _ in range(num_pred)])
+            self.bbox_embed = nn.ModuleList([self.bbox_embed for _ in range(num_pred)])
+            self.model.decoder.bbox_embed = None
+        if config.two_stage:
+            # hack implementation for two-stage
+            self.model.decoder.class_embed = self.class_embed
+            for box_embed in self.bbox_embed:
+                nn.init.constant_(box_embed.layers[-1].bias.data[2:], 0.0)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        aux_loss = [
+            {"logits": logits, "pred_boxes": pred_boxes}
+            for logits, pred_boxes in zip(outputs_class.transpose(0, 1)[:-1], outputs_coord.transpose(0, 1)[:-1])
+        ]
+        return aux_loss
+
+    @add_start_docstrings_to_model_forward(DETA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=DetaObjectDetectionOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.FloatTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], DetaObjectDetectionOutput]:
+        r"""
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DetaForObjectDetection
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("jozhang97/deta-swin-large")
+        >>> model = DetaForObjectDetection.from_pretrained("jozhang97/deta-swin-large")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.5, target_sizes=target_sizes)[
+        ...     0
+        ... ]
+        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
+        Detected cat with confidence 0.802 at location [9.87, 54.36, 316.93, 473.44]
+        Detected cat with confidence 0.795 at location [346.62, 24.35, 639.62, 373.2]
+        Detected remote with confidence 0.725 at location [40.41, 73.36, 175.77, 117.29]
+        Detected remote with confidence 0.638 at location [333.34, 76.81, 370.22, 187.94]
+        Detected couch with confidence 0.584 at location [0.03, 0.99, 640.02, 474.93]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # First, sent images through DETR base model to obtain encoder + decoder outputs
+        outputs = self.model(
+            pixel_values,
+            pixel_mask=pixel_mask,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs.intermediate_hidden_states if return_dict else outputs[2]
+        init_reference = outputs.init_reference_points if return_dict else outputs[0]
+        inter_references = outputs.intermediate_reference_points if return_dict else outputs[3]
+
+        # class logits + predicted bounding boxes
+        outputs_classes = []
+        outputs_coords = []
+
+        for level in range(hidden_states.shape[1]):
+            if level == 0:
+                reference = init_reference
+            else:
+                reference = inter_references[:, level - 1]
+            reference = inverse_sigmoid(reference)
+            outputs_class = self.class_embed[level](hidden_states[:, level])
+            delta_bbox = self.bbox_embed[level](hidden_states[:, level])
+            if reference.shape[-1] == 4:
+                outputs_coord_logits = delta_bbox + reference
+            elif reference.shape[-1] == 2:
+                delta_bbox[..., :2] += reference
+                outputs_coord_logits = delta_bbox
+            else:
+                raise ValueError(f"reference.shape[-1] should be 4 or 2, but got {reference.shape[-1]}")
+            outputs_coord = outputs_coord_logits.sigmoid()
+            outputs_classes.append(outputs_class)
+            outputs_coords.append(outputs_coord)
+        # Keep batch_size as first dimension
+        outputs_class = torch.stack(outputs_classes, dim=1)
+        outputs_coord = torch.stack(outputs_coords, dim=1)
+
+        logits = outputs_class[:, -1]
+        pred_boxes = outputs_coord[:, -1]
+
+        loss, loss_dict, auxiliary_outputs = None, None, None
+        if labels is not None:
+            # First: create the matcher
+            matcher = DetaHungarianMatcher(
+                class_cost=self.config.class_cost, bbox_cost=self.config.bbox_cost, giou_cost=self.config.giou_cost
+            )
+            # Second: create the criterion
+            losses = ["labels", "boxes", "cardinality"]
+            criterion = DetaLoss(
+                matcher=matcher,
+                num_classes=self.config.num_labels,
+                focal_alpha=self.config.focal_alpha,
+                losses=losses,
+                num_queries=self.config.num_queries,
+                assign_first_stage=self.config.assign_first_stage,
+                assign_second_stage=self.config.assign_second_stage,
+            )
+            criterion.to(logits.device)
+            # Third: compute the losses, based on outputs and labels
+            outputs_loss = {}
+            outputs_loss["logits"] = logits
+            outputs_loss["pred_boxes"] = pred_boxes
+            outputs_loss["init_reference"] = init_reference
+            if self.config.auxiliary_loss:
+                auxiliary_outputs = self._set_aux_loss(outputs_class, outputs_coord)
+                outputs_loss["auxiliary_outputs"] = auxiliary_outputs
+            if self.config.two_stage:
+                enc_outputs_coord = outputs.enc_outputs_coord_logits.sigmoid()
+                outputs_loss["enc_outputs"] = {
+                    "logits": outputs.enc_outputs_class,
+                    "pred_boxes": enc_outputs_coord,
+                    "anchors": outputs.output_proposals.sigmoid(),
+                }
+
+            loss_dict = criterion(outputs_loss, labels)
+            # Fourth: compute total loss, as a weighted sum of the various losses
+            weight_dict = {"loss_ce": 1, "loss_bbox": self.config.bbox_loss_coefficient}
+            weight_dict["loss_giou"] = self.config.giou_loss_coefficient
+            if self.config.auxiliary_loss:
+                aux_weight_dict = {}
+                for i in range(self.config.decoder_layers - 1):
+                    aux_weight_dict.update({k + f"_{i}": v for k, v in weight_dict.items()})
+                aux_weight_dict.update({k + "_enc": v for k, v in weight_dict.items()})
+                weight_dict.update(aux_weight_dict)
+            loss = sum(loss_dict[k] * weight_dict[k] for k in loss_dict if k in weight_dict)
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes) + auxiliary_outputs + outputs
+            else:
+                output = (logits, pred_boxes) + outputs
+            tuple_outputs = ((loss, loss_dict) + output) if loss is not None else output
+
+            return tuple_outputs
+
+        dict_outputs = DetaObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=outputs.last_hidden_state,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            intermediate_hidden_states=outputs.intermediate_hidden_states,
+            intermediate_reference_points=outputs.intermediate_reference_points,
+            init_reference_points=outputs.init_reference_points,
+            enc_outputs_class=outputs.enc_outputs_class,
+            enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+            output_proposals=outputs.output_proposals,
+        )
+
+        return dict_outputs
+
+
+def dice_loss(inputs, targets, num_boxes):
+    """
+    Compute the DICE loss, similar to generalized IOU for masks
+
+    Args:
+        inputs: A float tensor of arbitrary shape.
+                The predictions for each example.
+        targets: A float tensor with the same shape as inputs. Stores the binary
+                 classification label for each element in inputs (0 for the negative class and 1 for the positive
+                 class).
+    """
+    inputs = inputs.sigmoid()
+    inputs = inputs.flatten(1)
+    numerator = 2 * (inputs * targets).sum(1)
+    denominator = inputs.sum(-1) + targets.sum(-1)
+    loss = 1 - (numerator + 1) / (denominator + 1)
+    return loss.sum() / num_boxes
+
+
+def sigmoid_focal_loss(inputs, targets, num_boxes, alpha: float = 0.25, gamma: float = 2):
+    """
+    Loss used in RetinaNet for dense detection: https://huggingface.co/papers/1708.02002.
+
+    Args:
+        inputs (`torch.FloatTensor` of arbitrary shape):
+            The predictions for each example.
+        targets (`torch.FloatTensor` with the same shape as `inputs`)
+            A tensor storing the binary classification label for each element in the `inputs` (0 for the negative class
+            and 1 for the positive class).
+        alpha (`float`, *optional*, defaults to `0.25`):
+            Optional weighting factor in the range (0,1) to balance positive vs. negative examples.
+        gamma (`int`, *optional*, defaults to `2`):
+            Exponent of the modulating factor (1 - p_t) to balance easy vs hard examples.
+
+    Returns:
+        Loss tensor
+    """
+    prob = inputs.sigmoid()
+    ce_loss = nn.functional.binary_cross_entropy_with_logits(inputs, targets, reduction="none")
+    # add modulating factor
+    p_t = prob * targets + (1 - prob) * (1 - targets)
+    loss = ce_loss * ((1 - p_t) ** gamma)
+
+    if alpha >= 0:
+        alpha_t = alpha * targets + (1 - alpha) * (1 - targets)
+        loss = alpha_t * loss
+
+    return loss.mean(1).sum() / num_boxes
+
+
+class DetaLoss(nn.Module):
+    """
+    This class computes the losses for `DetaForObjectDetection`. The process happens in two steps: 1) we compute
+    hungarian assignment between ground truth boxes and the outputs of the model 2) we supervise each pair of matched
+    ground-truth / prediction (supervised class and box).
+
+    Args:
+        matcher (`DetaHungarianMatcher`):
+            Module able to compute a matching between targets and proposals.
+        num_classes (`int`):
+            Number of object categories, omitting the special no-object category.
+        focal_alpha (`float`):
+            Alpha parameter in focal loss.
+        losses (`list[str]`):
+            List of all the losses to be applied. See `get_loss` for a list of all available losses.
+    """
+
+    def __init__(
+        self,
+        matcher,
+        num_classes,
+        focal_alpha,
+        losses,
+        num_queries,
+        assign_first_stage=False,
+        assign_second_stage=False,
+    ):
+        super().__init__()
+        self.matcher = matcher
+        self.num_classes = num_classes
+        self.focal_alpha = focal_alpha
+        self.losses = losses
+        self.assign_first_stage = assign_first_stage
+        self.assign_second_stage = assign_second_stage
+
+        if self.assign_first_stage:
+            self.stg1_assigner = DetaStage1Assigner()
+        if self.assign_second_stage:
+            self.stg2_assigner = DetaStage2Assigner(num_queries)
+
+    def loss_labels(self, outputs, targets, indices, num_boxes):
+        """
+        Classification loss (Binary focal loss) targets dicts must contain the key "class_labels" containing a tensor
+        of dim [nb_target_boxes]
+        """
+        if "logits" not in outputs:
+            raise KeyError("No logits were found in the outputs")
+        source_logits = outputs["logits"]
+
+        idx = self._get_source_permutation_idx(indices)
+        target_classes_o = torch.cat([t["class_labels"][J] for t, (_, J) in zip(targets, indices)])
+        target_classes = torch.full(
+            source_logits.shape[:2], self.num_classes, dtype=torch.int64, device=source_logits.device
+        )
+        target_classes[idx] = target_classes_o
+
+        target_classes_onehot = torch.zeros(
+            [source_logits.shape[0], source_logits.shape[1], source_logits.shape[2] + 1],
+            dtype=source_logits.dtype,
+            layout=source_logits.layout,
+            device=source_logits.device,
+        )
+        target_classes_onehot.scatter_(2, target_classes.unsqueeze(-1), 1)
+
+        target_classes_onehot = target_classes_onehot[:, :, :-1]
+        loss_ce = (
+            sigmoid_focal_loss(source_logits, target_classes_onehot, num_boxes, alpha=self.focal_alpha, gamma=2)
+            * source_logits.shape[1]
+        )
+        losses = {"loss_ce": loss_ce}
+
+        return losses
+
+    @torch.no_grad()
+    def loss_cardinality(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the cardinality error, i.e. the absolute error in the number of predicted non-empty boxes.
+
+        This is not really a loss, it is intended for logging purposes only. It doesn't propagate gradients.
+        """
+        logits = outputs["logits"]
+        device = logits.device
+        target_lengths = torch.as_tensor([len(v["class_labels"]) for v in targets], device=device)
+        # Count the number of predictions that are NOT "no-object" (which is the last class)
+        card_pred = (logits.argmax(-1) != logits.shape[-1] - 1).sum(1)
+        card_err = nn.functional.l1_loss(card_pred.float(), target_lengths.float())
+        losses = {"cardinality_error": card_err}
+        return losses
+
+    def loss_boxes(self, outputs, targets, indices, num_boxes):
+        """
+        Compute the losses related to the bounding boxes, the L1 regression loss and the GIoU loss.
+
+        Targets dicts must contain the key "boxes" containing a tensor of dim [nb_target_boxes, 4]. The target boxes
+        are expected in format (center_x, center_y, w, h), normalized by the image size.
+        """
+        if "pred_boxes" not in outputs:
+            raise KeyError("No predicted boxes found in outputs")
+        idx = self._get_source_permutation_idx(indices)
+        source_boxes = outputs["pred_boxes"][idx]
+        target_boxes = torch.cat([t["boxes"][i] for t, (_, i) in zip(targets, indices)], dim=0)
+
+        loss_bbox = nn.functional.l1_loss(source_boxes, target_boxes, reduction="none")
+
+        losses = {}
+        losses["loss_bbox"] = loss_bbox.sum() / num_boxes
+
+        loss_giou = 1 - torch.diag(
+            generalized_box_iou(center_to_corners_format(source_boxes), center_to_corners_format(target_boxes))
+        )
+        losses["loss_giou"] = loss_giou.sum() / num_boxes
+        return losses
+
+    def _get_source_permutation_idx(self, indices):
+        # permute predictions following indices
+        batch_idx = torch.cat([torch.full_like(source, i) for i, (source, _) in enumerate(indices)])
+        source_idx = torch.cat([source for (source, _) in indices])
+        return batch_idx, source_idx
+
+    def _get_target_permutation_idx(self, indices):
+        # permute targets following indices
+        batch_idx = torch.cat([torch.full_like(target, i) for i, (_, target) in enumerate(indices)])
+        target_idx = torch.cat([target for (_, target) in indices])
+        return batch_idx, target_idx
+
+    def get_loss(self, loss, outputs, targets, indices, num_boxes):
+        loss_map = {
+            "labels": self.loss_labels,
+            "cardinality": self.loss_cardinality,
+            "boxes": self.loss_boxes,
+        }
+        if loss not in loss_map:
+            raise ValueError(f"Loss {loss} not supported")
+        return loss_map[loss](outputs, targets, indices, num_boxes)
+
+    def forward(self, outputs, targets):
+        """
+        This performs the loss computation.
+
+        Args:
+             outputs (`dict`, *optional*):
+                Dictionary of tensors, see the output specification of the model for the format.
+             targets (`list[dict]`, *optional*):
+                List of dicts, such that `len(targets) == batch_size`. The expected keys in each dict depends on the
+                losses applied, see each loss' doc.
+        """
+        outputs_without_aux = {k: v for k, v in outputs.items() if k not in ("auxiliary_outputs", "enc_outputs")}
+
+        # Retrieve the matching between the outputs of the last layer and the targets
+        if self.assign_second_stage:
+            indices = self.stg2_assigner(outputs_without_aux, targets)
+        else:
+            indices = self.matcher(outputs_without_aux, targets)
+
+        # Compute the average number of target boxes across all nodes, for normalization purposes
+        num_boxes = sum(len(t["class_labels"]) for t in targets)
+        num_boxes = torch.as_tensor([num_boxes], dtype=torch.float, device=next(iter(outputs.values())).device)
+        # Check that we have initialized the distributed state
+        world_size = 1
+        if is_accelerate_available():
+            if PartialState._shared_state != {}:
+                num_boxes = reduce(num_boxes)
+                world_size = PartialState().num_processes
+        num_boxes = torch.clamp(num_boxes / world_size, min=1).item()
+
+        # Compute all the requested losses
+        losses = {}
+        for loss in self.losses:
+            losses.update(self.get_loss(loss, outputs, targets, indices, num_boxes))
+
+        # In case of auxiliary losses, we repeat this process with the output of each intermediate layer.
+        if "auxiliary_outputs" in outputs:
+            for i, auxiliary_outputs in enumerate(outputs["auxiliary_outputs"]):
+                if not self.assign_second_stage:
+                    indices = self.matcher(auxiliary_outputs, targets)
+                for loss in self.losses:
+                    l_dict = self.get_loss(loss, auxiliary_outputs, targets, indices, num_boxes)
+                    l_dict = {k + f"_{i}": v for k, v in l_dict.items()}
+                    losses.update(l_dict)
+
+        if "enc_outputs" in outputs:
+            enc_outputs = outputs["enc_outputs"]
+            bin_targets = copy.deepcopy(targets)
+            for bt in bin_targets:
+                bt["class_labels"] = torch.zeros_like(bt["class_labels"])
+            if self.assign_first_stage:
+                indices = self.stg1_assigner(enc_outputs, bin_targets)
+            else:
+                indices = self.matcher(enc_outputs, bin_targets)
+            for loss in self.losses:
+                l_dict = self.get_loss(loss, enc_outputs, bin_targets, indices, num_boxes)
+                l_dict = {k + "_enc": v for k, v in l_dict.items()}
+                losses.update(l_dict)
+
+        return losses
+
+
+class DetaMLPPredictionHead(nn.Module):
+    """
+    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+    height and width of a bounding box w.r.t. an image.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+
+    """
+
+    def __init__(self, input_dim, hidden_dim, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [hidden_dim] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+class DetaHungarianMatcher(nn.Module):
+    """
+    This class computes an assignment between the targets and the predictions of the network.
+
+    For efficiency reasons, the targets don't include the no_object. Because of this, in general, there are more
+    predictions than targets. In this case, we do a 1-to-1 matching of the best predictions, while the others are
+    un-matched (and thus treated as non-objects).
+
+    Args:
+        class_cost:
+            The relative weight of the classification error in the matching cost.
+        bbox_cost:
+            The relative weight of the L1 error of the bounding box coordinates in the matching cost.
+        giou_cost:
+            The relative weight of the giou loss of the bounding box in the matching cost.
+    """
+
+    def __init__(self, class_cost: float = 1, bbox_cost: float = 1, giou_cost: float = 1):
+        super().__init__()
+        requires_backends(self, ["scipy"])
+
+        self.class_cost = class_cost
+        self.bbox_cost = bbox_cost
+        self.giou_cost = giou_cost
+        if class_cost == 0 and bbox_cost == 0 and giou_cost == 0:
+            raise ValueError("All costs of the Matcher can't be 0")
+
+    @torch.no_grad()
+    def forward(self, outputs, targets):
+        """
+        Args:
+            outputs (`dict`):
+                A dictionary that contains at least these entries:
+                * "logits": Tensor of dim [batch_size, num_queries, num_classes] with the classification logits
+                * "pred_boxes": Tensor of dim [batch_size, num_queries, 4] with the predicted box coordinates.
+            targets (`list[dict]`):
+                A list of targets (len(targets) = batch_size), where each target is a dict containing:
+                * "class_labels": Tensor of dim [num_target_boxes] (where num_target_boxes is the number of
+                  ground-truth
+                 objects in the target) containing the class labels
+                * "boxes": Tensor of dim [num_target_boxes, 4] containing the target box coordinates.
+
+        Returns:
+            `list[Tuple]`: A list of size `batch_size`, containing tuples of (index_i, index_j) where:
+            - index_i is the indices of the selected predictions (in order)
+            - index_j is the indices of the corresponding selected targets (in order)
+            For each batch element, it holds: len(index_i) = len(index_j) = min(num_queries, num_target_boxes)
+        """
+        batch_size, num_queries = outputs["logits"].shape[:2]
+
+        # We flatten to compute the cost matrices in a batch
+        out_prob = outputs["logits"].flatten(0, 1).sigmoid()  # [batch_size * num_queries, num_classes]
+        out_bbox = outputs["pred_boxes"].flatten(0, 1)  # [batch_size * num_queries, 4]
+
+        # Also concat the target labels and boxes
+        target_ids = torch.cat([v["class_labels"] for v in targets])
+        target_bbox = torch.cat([v["boxes"] for v in targets])
+
+        # Compute the classification cost.
+        alpha = 0.25
+        gamma = 2.0
+        neg_cost_class = (1 - alpha) * (out_prob**gamma) * (-(1 - out_prob + 1e-8).log())
+        pos_cost_class = alpha * ((1 - out_prob) ** gamma) * (-(out_prob + 1e-8).log())
+        class_cost = pos_cost_class[:, target_ids] - neg_cost_class[:, target_ids]
+
+        # Compute the L1 cost between boxes
+        bbox_cost = torch.cdist(out_bbox, target_bbox, p=1)
+
+        # Compute the giou cost between boxes
+        giou_cost = -generalized_box_iou(center_to_corners_format(out_bbox), center_to_corners_format(target_bbox))
+
+        # Final cost matrix
+        cost_matrix = self.bbox_cost * bbox_cost + self.class_cost * class_cost + self.giou_cost * giou_cost
+        cost_matrix = cost_matrix.view(batch_size, num_queries, -1).cpu()
+
+        sizes = [len(v["boxes"]) for v in targets]
+        indices = [linear_sum_assignment(c[i]) for i, c in enumerate(cost_matrix.split(sizes, -1))]
+        return [(torch.as_tensor(i, dtype=torch.int64), torch.as_tensor(j, dtype=torch.int64)) for i, j in indices]
+
+
+def _upcast(t: Tensor) -> Tensor:
+    # Protects from numerical overflows in multiplications by upcasting to the equivalent higher type
+    if t.is_floating_point():
+        return t if t.dtype in (torch.float32, torch.float64) else t.float()
+    else:
+        return t if t.dtype in (torch.int32, torch.int64) else t.int()
+
+
+def box_area(boxes: Tensor) -> Tensor:
+    """
+    Computes the area of a set of bounding boxes, which are specified by its (x1, y1, x2, y2) coordinates.
+
+    Args:
+        boxes (`torch.FloatTensor` of shape `(number_of_boxes, 4)`):
+            Boxes for which the area will be computed. They are expected to be in (x1, y1, x2, y2) format with `0 <= x1
+            < x2` and `0 <= y1 < y2`.
+
+    Returns:
+        `torch.FloatTensor`: a tensor containing the area for each box.
+    """
+    boxes = _upcast(boxes)
+    return (boxes[:, 2] - boxes[:, 0]) * (boxes[:, 3] - boxes[:, 1])
+
+
+def box_iou(boxes1, boxes2):
+    area1 = box_area(boxes1)
+    area2 = box_area(boxes2)
+
+    left_top = torch.max(boxes1[:, None, :2], boxes2[:, :2])  # [N,M,2]
+    right_bottom = torch.min(boxes1[:, None, 2:], boxes2[:, 2:])  # [N,M,2]
+
+    width_height = (right_bottom - left_top).clamp(min=0)  # [N,M,2]
+    inter = width_height[:, :, 0] * width_height[:, :, 1]  # [N,M]
+
+    union = area1[:, None] + area2 - inter
+
+    iou = inter / union
+    return iou, union
+
+
+def generalized_box_iou(boxes1, boxes2):
+    """
+    Generalized IoU from https://giou.stanford.edu/. The boxes should be in [x0, y0, x1, y1] (corner) format.
+
+    Returns:
+        `torch.FloatTensor`: a [N, M] pairwise matrix, where N = len(boxes1) and M = len(boxes2)
+    """
+    # degenerate boxes gives inf / nan results
+    # so do an early check
+    if not (boxes1[:, 2:] >= boxes1[:, :2]).all():
+        raise ValueError(f"boxes1 must be in [x0, y0, x1, y1] (corner) format, but got {boxes1}")
+    if not (boxes2[:, 2:] >= boxes2[:, :2]).all():
+        raise ValueError(f"boxes2 must be in [x0, y0, x1, y1] (corner) format, but got {boxes2}")
+    iou, union = box_iou(boxes1, boxes2)
+
+    top_left = torch.min(boxes1[:, None, :2], boxes2[:, :2])
+    bottom_right = torch.max(boxes1[:, None, 2:], boxes2[:, 2:])
+
+    width_height = (bottom_right - top_left).clamp(min=0)  # [N,M,2]
+    area = width_height[:, :, 0] * width_height[:, :, 1]
+
+    return iou - (area - union) / area
+
+
+# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/layers/wrappers.py#L100
+def nonzero_tuple(x):
+    """
+    A 'as_tuple=True' version of torch.nonzero to support torchscript. because of
+    https://github.com/pytorch/pytorch/issues/38718
+    """
+    if torch.jit.is_scripting():
+        if x.dim() == 0:
+            return x.unsqueeze(0).nonzero().unbind(1)
+        return x.nonzero().unbind(1)
+    else:
+        return x.nonzero(as_tuple=True)
+
+
+# from https://github.com/facebookresearch/detectron2/blob/9921a2caa585d4fa66c4b534b6fab6e74d89b582/detectron2/modeling/matcher.py#L9
+class DetaMatcher:
+    """
+    This class assigns to each predicted "element" (e.g., a box) a ground-truth element. Each predicted element will
+    have exactly zero or one matches; each ground-truth element may be matched to zero or more predicted elements.
+
+    The matching is determined by the MxN match_quality_matrix, that characterizes how well each (ground-truth,
+    prediction)-pair match each other. For example, if the elements are boxes, this matrix may contain box
+    intersection-over-union overlap values.
+
+    The matcher returns (a) a vector of length N containing the index of the ground-truth element m in [0, M) that
+    matches to prediction n in [0, N). (b) a vector of length N containing the labels for each prediction.
+    """
+
+    def __init__(self, thresholds: list[float], labels: list[int], allow_low_quality_matches: bool = False):
+        """
+        Args:
+            thresholds (`list[float]`):
+                A list of thresholds used to stratify predictions into levels.
+            labels (`list[int`):
+                A list of values to label predictions belonging at each level. A label can be one of {-1, 0, 1}
+                signifying {ignore, negative class, positive class}, respectively.
+            allow_low_quality_matches (`bool`, *optional*, defaults to `False`):
+                If `True`, produce additional matches for predictions with maximum match quality lower than
+                high_threshold. See `set_low_quality_matches_` for more details.
+
+            For example,
+                thresholds = [0.3, 0.5] labels = [0, -1, 1] All predictions with iou < 0.3 will be marked with 0 and
+                thus will be considered as false positives while training. All predictions with 0.3 <= iou < 0.5 will
+                be marked with -1 and thus will be ignored. All predictions with 0.5 <= iou will be marked with 1 and
+                thus will be considered as true positives.
+        """
+        # Add -inf and +inf to first and last position in thresholds
+        thresholds = thresholds[:]
+        if thresholds[0] < 0:
+            raise ValueError("Thresholds should be positive")
+        thresholds.insert(0, -float("inf"))
+        thresholds.append(float("inf"))
+        # Currently torchscript does not support all + generator
+        if not all(low <= high for (low, high) in zip(thresholds[:-1], thresholds[1:])):
+            raise ValueError("Thresholds should be sorted.")
+        if not all(l in [-1, 0, 1] for l in labels):
+            raise ValueError("All labels should be either -1, 0 or 1")
+        if len(labels) != len(thresholds) - 1:
+            raise ValueError("Number of labels should be equal to number of thresholds - 1")
+        self.thresholds = thresholds
+        self.labels = labels
+        self.allow_low_quality_matches = allow_low_quality_matches
+
+    def __call__(self, match_quality_matrix):
+        """
+        Args:
+            match_quality_matrix (Tensor[float]): an MxN tensor, containing the
+                pairwise quality between M ground-truth elements and N predicted elements. All elements must be >= 0
+                (due to the us of `torch.nonzero` for selecting indices in `set_low_quality_matches_`).
+
+        Returns:
+            matches (Tensor[int64]): a vector of length N, where matches[i] is a matched
+                ground-truth index in [0, M)
+            match_labels (Tensor[int8]): a vector of length N, where pred_labels[i] indicates
+                whether a prediction is a true or false positive or ignored
+        """
+        assert match_quality_matrix.dim() == 2
+        if match_quality_matrix.numel() == 0:
+            default_matches = match_quality_matrix.new_full((match_quality_matrix.size(1),), 0, dtype=torch.int64)
+            # When no gt boxes exist, we define IOU = 0 and therefore set labels
+            # to `self.labels[0]`, which usually defaults to background class 0
+            # To choose to ignore instead, can make labels=[-1,0,-1,1] + set appropriate thresholds
+            default_match_labels = match_quality_matrix.new_full(
+                (match_quality_matrix.size(1),), self.labels[0], dtype=torch.int8
+            )
+            return default_matches, default_match_labels
+
+        assert torch.all(match_quality_matrix >= 0)
+
+        # match_quality_matrix is M (gt) x N (predicted)
+        # Max over gt elements (dim 0) to find best gt candidate for each prediction
+        matched_vals, matches = match_quality_matrix.max(dim=0)
+
+        match_labels = matches.new_full(matches.size(), 1, dtype=torch.int8)
+
+        for l, low, high in zip(self.labels, self.thresholds[:-1], self.thresholds[1:]):
+            low_high = (matched_vals >= low) & (matched_vals < high)
+            match_labels[low_high] = l
+
+        if self.allow_low_quality_matches:
+            self.set_low_quality_matches_(match_labels, match_quality_matrix)
+
+        return matches, match_labels
+
+    def set_low_quality_matches_(self, match_labels, match_quality_matrix):
+        """
+        Produce additional matches for predictions that have only low-quality matches. Specifically, for each
+        ground-truth G find the set of predictions that have maximum overlap with it (including ties); for each
+        prediction in that set, if it is unmatched, then match it to the ground-truth G.
+
+        This function implements the RPN assignment case (i) in Sec. 3.1.2 of :paper:`Faster R-CNN`.
+        """
+        # For each gt, find the prediction with which it has highest quality
+        highest_quality_foreach_gt, _ = match_quality_matrix.max(dim=1)
+        # Find the highest quality match available, even if it is low, including ties.
+        # Note that the matches qualities must be positive due to the use of
+        # `torch.nonzero`.
+        _, pred_inds_with_highest_quality = nonzero_tuple(match_quality_matrix == highest_quality_foreach_gt[:, None])
+        # If an anchor was labeled positive only due to a low-quality match
+        # with gt_A, but it has larger overlap with gt_B, it's matched index will still be gt_B.
+        # This follows the implementation in Detectron, and is found to have no significant impact.
+        match_labels[pred_inds_with_highest_quality] = 1
+
+
+# from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/sampling.py#L9
+def subsample_labels(labels: torch.Tensor, num_samples: int, positive_fraction: float, bg_label: int):
+    """
+    Return `num_samples` (or fewer, if not enough found) random samples from `labels` which is a mixture of positives &
+    negatives. It will try to return as many positives as possible without exceeding `positive_fraction * num_samples`,
+    and then try to fill the remaining slots with negatives.
+
+    Args:
+        labels (Tensor): (N, ) label vector with values:
+            * -1: ignore
+            * bg_label: background ("negative") class
+            * otherwise: one or more foreground ("positive") classes
+        num_samples (int): The total number of labels with value >= 0 to return.
+            Values that are not sampled will be filled with -1 (ignore).
+        positive_fraction (float): The number of subsampled labels with values > 0
+            is `min(num_positives, int(positive_fraction * num_samples))`. The number of negatives sampled is
+            `min(num_negatives, num_samples - num_positives_sampled)`. In order words, if there are not enough
+            positives, the sample is filled with negatives. If there are also not enough negatives, then as many
+            elements are sampled as is possible.
+        bg_label (int): label index of background ("negative") class.
+
+    Returns:
+        pos_idx, neg_idx (Tensor):
+            1D vector of indices. The total length of both is `num_samples` or fewer.
+    """
+    positive = nonzero_tuple((labels != -1) & (labels != bg_label))[0]
+    negative = nonzero_tuple(labels == bg_label)[0]
+
+    num_pos = int(num_samples * positive_fraction)
+    # protect against not enough positive examples
+    num_pos = min(positive.numel(), num_pos)
+    num_neg = num_samples - num_pos
+    # protect against not enough negative examples
+    num_neg = min(negative.numel(), num_neg)
+
+    # randomly select positive and negative examples
+    perm1 = torch.randperm(positive.numel(), device=positive.device)[:num_pos]
+    perm2 = torch.randperm(negative.numel(), device=negative.device)[:num_neg]
+
+    pos_idx = positive[perm1]
+    neg_idx = negative[perm2]
+    return pos_idx, neg_idx
+
+
+def sample_topk_per_gt(pr_inds, gt_inds, iou, k):
+    if len(gt_inds) == 0:
+        return pr_inds, gt_inds
+    # find topk matches for each gt
+    gt_inds2, counts = gt_inds.unique(return_counts=True)
+    scores, pr_inds2 = iou[gt_inds2].topk(k, dim=1)
+    gt_inds2 = gt_inds2[:, None].repeat(1, k)
+
+    # filter to as many matches that gt has
+    pr_inds3 = torch.cat([pr[:c] for c, pr in zip(counts, pr_inds2)])
+    gt_inds3 = torch.cat([gt[:c] for c, gt in zip(counts, gt_inds2)])
+    return pr_inds3, gt_inds3
+
+
+# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/roi_heads/roi_heads.py#L123
+class DetaStage2Assigner(nn.Module):
+    def __init__(self, num_queries, max_k=4):
+        super().__init__()
+        self.positive_fraction = 0.25
+        self.bg_label = 400  # number > 91 to filter out later
+        self.batch_size_per_image = num_queries
+        self.proposal_matcher = DetaMatcher(thresholds=[0.6], labels=[0, 1], allow_low_quality_matches=True)
+        self.k = max_k
+
+    def _sample_proposals(self, matched_idxs: torch.Tensor, matched_labels: torch.Tensor, gt_classes: torch.Tensor):
+        """
+        Based on the matching between N proposals and M groundtruth, sample the proposals and set their classification
+        labels.
+
+        Args:
+            matched_idxs (Tensor): a vector of length N, each is the best-matched
+                gt index in [0, M) for each proposal.
+            matched_labels (Tensor): a vector of length N, the matcher's label
+                (one of cfg.MODEL.ROI_HEADS.IOU_LABELS) for each proposal.
+            gt_classes (Tensor): a vector of length M.
+
+        Returns:
+            Tensor: a vector of indices of sampled proposals. Each is in [0, N). Tensor: a vector of the same length,
+            the classification label for
+                each sampled proposal. Each sample is labeled as either a category in [0, num_classes) or the
+                background (num_classes).
+        """
+        has_gt = gt_classes.numel() > 0
+        # Get the corresponding GT for each proposal
+        if has_gt:
+            gt_classes = gt_classes[matched_idxs]
+            # Label unmatched proposals (0 label from matcher) as background (label=num_classes)
+            gt_classes[matched_labels == 0] = self.bg_label
+            # Label ignore proposals (-1 label)
+            gt_classes[matched_labels == -1] = -1
+        else:
+            gt_classes = torch.zeros_like(matched_idxs) + self.bg_label
+
+        sampled_fg_idxs, sampled_bg_idxs = subsample_labels(
+            gt_classes, self.batch_size_per_image, self.positive_fraction, self.bg_label
+        )
+
+        sampled_idxs = torch.cat([sampled_fg_idxs, sampled_bg_idxs], dim=0)
+        return sampled_idxs, gt_classes[sampled_idxs]
+
+    def forward(self, outputs, targets, return_cost_matrix=False):
+        # COCO categories are from 1 to 90. They set num_classes=91 and apply sigmoid.
+
+        bs = len(targets)
+        indices = []
+        ious = []
+        for b in range(bs):
+            iou, _ = box_iou(
+                center_to_corners_format(targets[b]["boxes"]),
+                center_to_corners_format(outputs["init_reference"][b].detach()),
+            )
+            matched_idxs, matched_labels = self.proposal_matcher(
+                iou
+            )  # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.6, 0 ow]
+            (
+                sampled_idxs,
+                sampled_gt_classes,
+            ) = self._sample_proposals(  # list of sampled proposal_ids, sampled_id -> [0, num_classes)+[bg_label]
+                matched_idxs, matched_labels, targets[b]["class_labels"]
+            )
+            pos_pr_inds = sampled_idxs[sampled_gt_classes != self.bg_label]
+            pos_gt_inds = matched_idxs[pos_pr_inds]
+            pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
+            indices.append((pos_pr_inds, pos_gt_inds))
+            ious.append(iou)
+        if return_cost_matrix:
+            return indices, ious
+        return indices
+
+    def postprocess_indices(self, pr_inds, gt_inds, iou):
+        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
+
+
+# modified from https://github.com/facebookresearch/detectron2/blob/cbbc1ce26473cb2a5cc8f58e8ada9ae14cb41052/detectron2/modeling/proposal_generator/rpn.py#L181
+class DetaStage1Assigner(nn.Module):
+    def __init__(self, t_low=0.3, t_high=0.7, max_k=4):
+        super().__init__()
+        self.positive_fraction = 0.5
+        self.batch_size_per_image = 256
+        self.k = max_k
+        self.t_low = t_low
+        self.t_high = t_high
+        self.anchor_matcher = DetaMatcher(
+            thresholds=[t_low, t_high], labels=[0, -1, 1], allow_low_quality_matches=True
+        )
+
+    def _subsample_labels(self, label):
+        """
+        Randomly sample a subset of positive and negative examples, and overwrite the label vector to the ignore value
+        (-1) for all elements that are not included in the sample.
+
+        Args:
+            labels (Tensor): a vector of -1, 0, 1. Will be modified in-place and returned.
+        """
+        pos_idx, neg_idx = subsample_labels(label, self.batch_size_per_image, self.positive_fraction, 0)
+        # Fill with the ignore label (-1), then set positive and negative labels
+        label.fill_(-1)
+        label.scatter_(0, pos_idx, 1)
+        label.scatter_(0, neg_idx, 0)
+        return label
+
+    def forward(self, outputs, targets):
+        bs = len(targets)
+        indices = []
+        for b in range(bs):
+            anchors = outputs["anchors"][b]
+            if len(targets[b]["boxes"]) == 0:
+                indices.append(
+                    (
+                        torch.tensor([], dtype=torch.long, device=anchors.device),
+                        torch.tensor([], dtype=torch.long, device=anchors.device),
+                    )
+                )
+                continue
+            iou, _ = box_iou(
+                center_to_corners_format(targets[b]["boxes"]),
+                center_to_corners_format(anchors),
+            )
+            matched_idxs, matched_labels = self.anchor_matcher(
+                iou
+            )  # proposal_id -> highest_iou_gt_id, proposal_id -> [1 if iou > 0.7, 0 if iou < 0.3, -1 ow]
+            matched_labels = self._subsample_labels(matched_labels)
+
+            all_pr_inds = torch.arange(len(anchors), device=matched_labels.device)
+            pos_pr_inds = all_pr_inds[matched_labels == 1]
+            pos_gt_inds = matched_idxs[pos_pr_inds]
+            pos_pr_inds, pos_gt_inds = self.postprocess_indices(pos_pr_inds, pos_gt_inds, iou)
+            pos_pr_inds, pos_gt_inds = pos_pr_inds.to(anchors.device), pos_gt_inds.to(anchors.device)
+            indices.append((pos_pr_inds, pos_gt_inds))
+        return indices
+
+    def postprocess_indices(self, pr_inds, gt_inds, iou):
+        return sample_topk_per_gt(pr_inds, gt_inds, iou, self.k)
+
+
+__all__ = ["DetaForObjectDetection", "DetaModel", "DetaPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..db3d0a634051c713aee33b48f2962647ac48d0eb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_efficientformer import *
+    from .image_processing_efficientformer import *
+    from .modeling_efficientformer import *
+    from .modeling_tf_efficientformer import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aaee649ba047e5967f132eb0b3527dce5855d11
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/configuration_efficientformer.py
@@ -0,0 +1,170 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""EfficientFormer model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class EfficientFormerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`EfficientFormerModel`]. It is used to
+    instantiate an EfficientFormer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the EfficientFormer
+    [snap-research/efficientformer-l1](https://huggingface.co/snap-research/efficientformer-l1) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        depths (`List(int)`, *optional*, defaults to `[3, 2, 6, 4]`)
+            Depth of each stage.
+        hidden_sizes (`List(int)`, *optional*, defaults to `[48, 96, 224, 448]`)
+            Dimensionality of each stage.
+        downsamples (`List(bool)`, *optional*, defaults to `[True, True, True, True]`)
+            Whether or not to downsample inputs between two stages.
+        dim (`int`, *optional*, defaults to 448):
+            Number of channels in Meta3D layers
+        key_dim (`int`, *optional*, defaults to 32):
+            The size of the key in meta3D block.
+        attention_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the dimension of the query and value to the dimension of the key in MSHA block
+        resolution (`int`, *optional*, defaults to 7)
+            Size of each patch
+        num_hidden_layers (`int`, *optional*, defaults to 5):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the 3D MetaBlock.
+        mlp_expansion_ratio (`int`, *optional*, defaults to 4):
+            Ratio of size of the hidden dimensionality of an MLP to the dimensionality of its input.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        pool_size (`int`, *optional*, defaults to 3):
+            Kernel size of pooling layers.
+        downsample_patch_size (`int`, *optional*, defaults to 3):
+            The size of patches in downsampling layers.
+        downsample_stride (`int`, *optional*, defaults to 2):
+            The stride of convolution kernels in downsampling layers.
+        downsample_pad (`int`, *optional*, defaults to 1):
+            Padding in downsampling layers.
+        drop_path_rate (`int`, *optional*, defaults to 0):
+            Rate at which to increase dropout probability in DropPath.
+        num_meta3d_blocks (`int`, *optional*, defaults to 1):
+            The number of 3D MetaBlocks in the last stage.
+        distillation (`bool`, *optional*, defaults to `True`):
+            Whether to add a distillation head.
+        use_layer_scale (`bool`, *optional*, defaults to `True`):
+            Whether to scale outputs from token mixers.
+        layer_scale_init_value (`float`, *optional*, defaults to 1e-5):
+            Factor by which outputs from token mixers are scaled.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to `224`):
+            The size (resolution) of each image.
+
+    Example:
+
+    ```python
+    >>> from transformers import EfficientFormerConfig, EfficientFormerModel
+
+    >>> # Initializing a EfficientFormer efficientformer-l1 style configuration
+    >>> configuration = EfficientFormerConfig()
+
+    >>> # Initializing a EfficientFormerModel (with random weights) from the efficientformer-l3 style configuration
+    >>> model = EfficientFormerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "efficientformer"
+
+    def __init__(
+        self,
+        depths: list[int] = [3, 2, 6, 4],
+        hidden_sizes: list[int] = [48, 96, 224, 448],
+        downsamples: list[bool] = [True, True, True, True],
+        dim: int = 448,
+        key_dim: int = 32,
+        attention_ratio: int = 4,
+        resolution: int = 7,
+        num_hidden_layers: int = 5,
+        num_attention_heads: int = 8,
+        mlp_expansion_ratio: int = 4,
+        hidden_dropout_prob: float = 0.0,
+        patch_size: int = 16,
+        num_channels: int = 3,
+        pool_size: int = 3,
+        downsample_patch_size: int = 3,
+        downsample_stride: int = 2,
+        downsample_pad: int = 1,
+        drop_path_rate: float = 0.0,
+        num_meta3d_blocks: int = 1,
+        distillation: bool = True,
+        use_layer_scale: bool = True,
+        layer_scale_init_value: float = 1e-5,
+        hidden_act: str = "gelu",
+        initializer_range: float = 0.02,
+        layer_norm_eps: float = 1e-12,
+        image_size: int = 224,
+        batch_norm_eps: float = 1e-05,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.hidden_sizes = hidden_sizes
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.depths = depths
+        self.mlp_expansion_ratio = mlp_expansion_ratio
+        self.downsamples = downsamples
+        self.dim = dim
+        self.key_dim = key_dim
+        self.attention_ratio = attention_ratio
+        self.resolution = resolution
+        self.pool_size = pool_size
+        self.downsample_patch_size = downsample_patch_size
+        self.downsample_stride = downsample_stride
+        self.downsample_pad = downsample_pad
+        self.drop_path_rate = drop_path_rate
+        self.num_meta3d_blocks = num_meta3d_blocks
+        self.distillation = distillation
+        self.use_layer_scale = use_layer_scale
+        self.layer_scale_init_value = layer_scale_init_value
+        self.image_size = image_size
+        self.batch_norm_eps = batch_norm_eps
+
+
+__all__ = [
+    "EfficientFormerConfig",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..a2dd1281e920fdea3b4c51ef79c09bc86c605921
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/image_processing_efficientformer.py
@@ -0,0 +1,324 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for EfficientFormer."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
+    get_resize_output_image_size,
+    resize,
+    to_channel_dimension_format,
+)
+from ....image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_batched,
+    is_scaled_image,
+    to_numpy_array,
+    valid_images,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ....utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class EfficientFormerImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a EfficientFormer image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
+            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`dict[str, int]` *optional*, defaults to 224):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize:
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        crop_size: Optional[dict[str, int]] = None,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+
+        self.do_resize = do_resize
+        self.do_rescale = do_rescale
+        self.do_normalize = do_normalize
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.size = size
+        self.resample = resample
+        self.rescale_factor = rescale_factor
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_DEFAULT_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_DEFAULT_STD
+        self._valid_processor_keys = [
+            "images",
+            "do_resize",
+            "size",
+            "resample",
+            "do_center_crop",
+            "crop_size",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample:
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+
+        if "shortest_edge" in size:
+            size = get_resize_output_image_size(
+                image, size=size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+            )
+            # size = get_resize_output_image_size(image, size["shortest_edge"], size["longest_edge"])
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError(f"Size must contain 'height' and 'width' keys or 'shortest_edge' key. Got {size.keys()}")
+        return resize(
+            image, size=size, resample=resample, data_format=data_format, input_data_format=input_data_format, **kwargs
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[int] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
+                resizing.
+            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+                an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use if `do_normalize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+        resample = resample if resample is not None else self.resample
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        size = size if size is not None else self.size
+        size_dict = get_size_dict(size)
+
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        if not is_batched(images):
+            images = [images]
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size_dict, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_center_crop:
+            images = [
+                self.center_crop(image=image, size=crop_size, input_data_format=input_data_format) for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["EfficientFormerImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..d35d3e82c007c6c6cf9326e1c933df34c6ae41ba
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_efficientformer.py
@@ -0,0 +1,786 @@
+# coding=utf-8
+# Copyright 2022 Snapchat Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch EfficientFormer model."""
+
+import itertools
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_efficientformer import EfficientFormerConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "EfficientFormerConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
+_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "Egyptian cat"
+
+
+class EfficientFormerPatchEmbeddings(nn.Module):
+    """
+    This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels,
+    height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride]
+    """
+
+    def __init__(self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True):
+        super().__init__()
+        self.num_channels = num_channels
+
+        self.projection = nn.Conv2d(
+            num_channels,
+            embed_dim,
+            kernel_size=config.downsample_patch_size,
+            stride=config.downsample_stride,
+            padding=config.downsample_pad,
+        )
+        self.norm = nn.BatchNorm2d(embed_dim, eps=config.batch_norm_eps) if apply_norm else nn.Identity()
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+
+        embeddings = self.projection(pixel_values)
+        embeddings = self.norm(embeddings)
+
+        return embeddings
+
+
+class EfficientFormerSelfAttention(nn.Module):
+    def __init__(self, dim: int, key_dim: int, num_heads: int, attention_ratio: int, resolution: int):
+        super().__init__()
+
+        self.num_heads = num_heads
+        self.key_dim = key_dim
+        self.attention_ratio = attention_ratio
+        self.scale = key_dim**-0.5
+        self.total_key_dim = key_dim * num_heads
+        self.expanded_key_dim = int(attention_ratio * key_dim)
+        self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
+        hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
+        self.qkv = nn.Linear(dim, hidden_size)
+        self.projection = nn.Linear(self.total_expanded_key_dim, dim)
+        points = list(itertools.product(range(resolution), range(resolution)))
+        num_points = len(points)
+        attention_offsets = {}
+        idxs = []
+        for point_1 in points:
+            for point_2 in points:
+                offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+        self.attention_biases = torch.nn.Parameter(torch.zeros(num_heads, len(attention_offsets)))
+        self.register_buffer("attention_bias_idxs", torch.LongTensor(idxs).view(num_points, num_points))
+
+    @torch.no_grad()
+    def train(self, mode=True):
+        super().train(mode)
+        if mode and hasattr(self, "ab"):
+            del self.ab
+        else:
+            self.ab = self.attention_biases[:, self.attention_bias_idxs]
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+        batch_size, sequence_length, num_channels = hidden_states.shape
+        qkv = self.qkv(hidden_states)
+        query_layer, key_layer, value_layer = qkv.reshape(batch_size, sequence_length, self.num_heads, -1).split(
+            [self.key_dim, self.key_dim, self.expanded_key_dim], dim=3
+        )
+        query_layer = query_layer.permute(0, 2, 1, 3)
+        key_layer = key_layer.permute(0, 2, 1, 3)
+        value_layer = value_layer.permute(0, 2, 1, 3)
+
+        # set `model.to(torch_device)` won't change `self.ab.device`, if there is no follow-up `train` or `eval` call.
+        # Let's do it manually here, so users won't have to do this everytime.
+        if not self.training:
+            self.ab = self.ab.to(self.attention_biases.device)
+        attention_probs = (torch.matmul(query_layer, key_layer.transpose(-2, -1))) * self.scale + (
+            self.attention_biases[:, self.attention_bias_idxs] if self.training else self.ab
+        )
+
+        attention_probs = attention_probs.softmax(dim=-1)
+
+        context_layer = torch.matmul(attention_probs, value_layer).transpose(1, 2)
+        context_layer = context_layer.reshape(batch_size, sequence_length, self.total_expanded_key_dim)
+        context_layer = self.projection(context_layer)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class EfficientFormerConvStem(nn.Module):
+    def __init__(self, config: EfficientFormerConfig, out_channels: int):
+        super().__init__()
+
+        self.convolution1 = nn.Conv2d(config.num_channels, out_channels // 2, kernel_size=3, stride=2, padding=1)
+        self.batchnorm_before = nn.BatchNorm2d(out_channels // 2, eps=config.batch_norm_eps)
+
+        self.convolution2 = nn.Conv2d(out_channels // 2, out_channels, kernel_size=3, stride=2, padding=1)
+        self.batchnorm_after = nn.BatchNorm2d(out_channels, eps=config.batch_norm_eps)
+
+        self.activation = nn.ReLU()
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        features = self.batchnorm_before(self.convolution1(pixel_values))
+        features = self.activation(features)
+        features = self.batchnorm_after(self.convolution2(features))
+        features = self.activation(features)
+
+        return features
+
+
+class EfficientFormerPooling(nn.Module):
+    def __init__(self, pool_size: int):
+        super().__init__()
+        self.pool = nn.AvgPool2d(pool_size, stride=1, padding=pool_size // 2, count_include_pad=False)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        output = self.pool(hidden_states) - hidden_states
+        return output
+
+
+class EfficientFormerDenseMlp(nn.Module):
+    def __init__(
+        self,
+        config: EfficientFormerConfig,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.linear_in = nn.Linear(in_features, hidden_features)
+        self.activation = ACT2FN[config.hidden_act]
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.linear_out = nn.Linear(hidden_features, out_features)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.linear_in(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.linear_out(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class EfficientFormerConvMlp(nn.Module):
+    def __init__(
+        self,
+        config: EfficientFormerConfig,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        drop: float = 0.0,
+    ):
+        super().__init__()
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.convolution1 = nn.Conv2d(in_features, hidden_features, 1)
+        self.activation = ACT2FN[config.hidden_act]
+        self.convolution2 = nn.Conv2d(hidden_features, out_features, 1)
+        self.dropout = nn.Dropout(drop)
+
+        self.batchnorm_before = nn.BatchNorm2d(hidden_features, eps=config.batch_norm_eps)
+        self.batchnorm_after = nn.BatchNorm2d(out_features, eps=config.batch_norm_eps)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.convolution1(hidden_state)
+        hidden_state = self.batchnorm_before(hidden_state)
+
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.dropout(hidden_state)
+        hidden_state = self.convolution2(hidden_state)
+
+        hidden_state = self.batchnorm_after(hidden_state)
+        hidden_state = self.dropout(hidden_state)
+
+        return hidden_state
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+class EfficientFormerDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class EfficientFormerFlat(nn.Module):
+    def __init__(self):
+        super().__init__()
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+        return hidden_states
+
+
+class EfficientFormerMeta3D(nn.Module):
+    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
+        super().__init__()
+
+        self.token_mixer = EfficientFormerSelfAttention(
+            dim=config.dim,
+            key_dim=config.key_dim,
+            num_heads=config.num_attention_heads,
+            attention_ratio=config.attention_ratio,
+            resolution=config.resolution,
+        )
+
+        self.layernorm1 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.layernorm2 = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+
+        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+        self.mlp = EfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim)
+
+        self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.use_layer_scale = config.use_layer_scale
+        if config.use_layer_scale:
+            self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+        self_attention_outputs = self.token_mixer(self.layernorm1(hidden_states), output_attentions)
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        if self.use_layer_scale:
+            layer_output = hidden_states + self.drop_path(
+                self.layer_scale_1.unsqueeze(0).unsqueeze(0) * attention_output
+            )
+            layer_output = layer_output + self.drop_path(
+                self.layer_scale_2.unsqueeze(0).unsqueeze(0) * self.mlp(self.layernorm2(layer_output))
+            )
+        else:
+            layer_output = hidden_states + self.drop_path(attention_output)
+            layer_output = layer_output + self.drop_path(self.mlp(self.layernorm2(layer_output)))
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+class EfficientFormerMeta3DLayers(nn.Module):
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__()
+        drop_paths = [
+            config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
+            for block_idx in range(config.num_meta3d_blocks)
+        ]
+        self.blocks = nn.ModuleList(
+            [EfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path) for drop_path in drop_paths]
+        )
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+        all_attention_outputs = () if output_attentions else None
+
+        for layer_module in self.blocks:
+            if isinstance(hidden_states, tuple):
+                hidden_states = hidden_states[0]
+
+            hidden_states = layer_module(hidden_states, output_attentions)
+
+            if output_attentions:
+                all_attention_outputs = all_attention_outputs + (hidden_states[1],)
+
+        if output_attentions:
+            outputs = (hidden_states[0],) + all_attention_outputs
+            return outputs
+
+        return hidden_states
+
+
+class EfficientFormerMeta4D(nn.Module):
+    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0):
+        super().__init__()
+        pool_size = config.pool_size if config.pool_size is not None else 3
+        self.token_mixer = EfficientFormerPooling(pool_size=pool_size)
+        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+        self.mlp = EfficientFormerConvMlp(
+            config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob
+        )
+
+        self.drop_path = EfficientFormerDropPath(drop_path) if drop_path > 0.0 else nn.Identity()
+        self.use_layer_scale = config.use_layer_scale
+        if config.use_layer_scale:
+            self.layer_scale_1 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+            self.layer_scale_2 = nn.Parameter(config.layer_scale_init_value * torch.ones(dim), requires_grad=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+        outputs = self.token_mixer(hidden_states)
+
+        if self.use_layer_scale:
+            layer_output = hidden_states + self.drop_path(self.layer_scale_1.unsqueeze(-1).unsqueeze(-1) * outputs)
+
+            layer_output = layer_output + self.drop_path(
+                self.layer_scale_2.unsqueeze(-1).unsqueeze(-1) * self.mlp(layer_output)
+            )
+        else:
+            layer_output = hidden_states + self.drop_path(outputs)
+            layer_output = layer_output + self.drop_path(self.mlp(layer_output))
+
+        return layer_output
+
+
+class EfficientFormerMeta4DLayers(nn.Module):
+    def __init__(self, config: EfficientFormerConfig, stage_idx: int):
+        super().__init__()
+        num_layers = (
+            config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
+        )
+        drop_paths = [
+            config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
+        ]
+
+        self.blocks = nn.ModuleList(
+            [
+                EfficientFormerMeta4D(config, config.hidden_sizes[stage_idx], drop_path=drop_path)
+                for drop_path in drop_paths
+            ]
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+        for layer_module in self.blocks:
+            hidden_states = layer_module(hidden_states)
+        return hidden_states
+
+
+class EfficientFormerIntermediateStage(nn.Module):
+    def __init__(self, config: EfficientFormerConfig, index: int):
+        super().__init__()
+        self.meta4D_layers = EfficientFormerMeta4DLayers(config, index)
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor]:
+        hidden_states = self.meta4D_layers(hidden_states)
+        return hidden_states
+
+
+class EfficientFormerLastStage(nn.Module):
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__()
+        self.meta4D_layers = EfficientFormerMeta4DLayers(config, -1)
+        self.flat = EfficientFormerFlat()
+        self.meta3D_layers = EfficientFormerMeta3DLayers(config)
+
+    def forward(self, hidden_states: torch.Tensor, output_attentions: bool = False) -> tuple[torch.Tensor]:
+        hidden_states = self.meta4D_layers(hidden_states)
+        hidden_states = self.flat(hidden_states)
+        hidden_states = self.meta3D_layers(hidden_states, output_attentions)
+
+        return hidden_states
+
+
+class EfficientFormerEncoder(nn.Module):
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__()
+        self.config = config
+        num_intermediate_stages = len(config.depths) - 1
+        downsamples = [
+            config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
+            for i in range(num_intermediate_stages)
+        ]
+        intermediate_stages = []
+
+        for i in range(num_intermediate_stages):
+            intermediate_stages.append(EfficientFormerIntermediateStage(config, i))
+            if downsamples[i]:
+                intermediate_stages.append(
+                    EfficientFormerPatchEmbeddings(config, config.hidden_sizes[i], config.hidden_sizes[i + 1])
+                )
+
+        self.intermediate_stages = nn.ModuleList(intermediate_stages)
+        self.last_stage = EfficientFormerLastStage(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_hidden_states: bool = False,
+        output_attentions: bool = False,
+        return_dict: bool = True,
+    ) -> BaseModelOutput:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        for layer_module in self.intermediate_stages:
+            hidden_states = layer_module(hidden_states)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        layer_output = self.last_stage(hidden_states, output_attentions=output_attentions)
+
+        if output_attentions:
+            all_self_attentions = all_self_attentions + layer_output[1:]
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (layer_output[0],)
+
+        if not return_dict:
+            return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
+
+        return BaseModelOutput(
+            last_hidden_state=layer_output[0],
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class EfficientFormerPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: EfficientFormerConfig
+    base_model_prefix = "efficientformer"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = False
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+EFFICIENTFORMER_START_DOCSTRING = r"""
+    This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) subclass. Use it as a
+    regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and behavior.
+
+    Parameters:
+        config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`ViTImageProcessor`]. See
+            [`ViTImageProcessor.preprocess`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
+    EFFICIENTFORMER_START_DOCSTRING,
+)
+class EfficientFormerModel(EfficientFormerPreTrainedModel):
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__(config)
+        self.config = config
+        _no_split_modules = ["EfficientFormerMeta4D"]
+
+        self.patch_embed = EfficientFormerConvStem(config, config.hidden_sizes[0])
+        self.encoder = EfficientFormerEncoder(config)
+        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPooling,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.patch_embed(pixel_values)
+        encoder_outputs = self.encoder(
+            embedding_output, output_attentions=output_attentions, output_hidden_states=output_hidden_states
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+
+        if not return_dict:
+            head_outputs = (sequence_output,)
+            return head_outputs + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    EfficientFormer Model transformer with an image classification head on top (a linear layer on top of the final
+    hidden state of the [CLS] token) e.g. for ImageNet.
+    """,
+    EFFICIENTFORMER_START_DOCSTRING,
+)
+class EfficientFormerForImageClassification(EfficientFormerPreTrainedModel):
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.efficientformer = EfficientFormerModel(config)
+
+        # Classifier head
+        self.classifier = (
+            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=ImageClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.efficientformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.classifier(sequence_output.mean(-2))
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@dataclass
+class EfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
+    """
+    Output type of [`EfficientFormerForImageClassificationWithTeacher`].
+
+    Args:
+        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores as the average of the cls_logits and distillation logits.
+        cls_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
+            class token).
+        distillation_logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
+            distillation token).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+    """
+
+    logits: Optional[torch.FloatTensor] = None
+    cls_logits: Optional[torch.FloatTensor] = None
+    distillation_logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@add_start_docstrings(
+    """
+    EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
+    state of the [CLS] token and a linear layer on top of the final hidden state of the distillation token) e.g. for
+    ImageNet.
+
+    <Tip warning={true}>
+
+           This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
+           supported.
+
+    </Tip>
+    """,
+    EFFICIENTFORMER_START_DOCSTRING,
+)
+class EfficientFormerForImageClassificationWithTeacher(EfficientFormerPreTrainedModel):
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.efficientformer = EfficientFormerModel(config)
+
+        # Classifier head
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        # Distillation head
+        self.distillation_classifier = (
+            nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=EfficientFormerForImageClassificationWithTeacherOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, EfficientFormerForImageClassificationWithTeacherOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.efficientformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        cls_logits = self.classifier(sequence_output.mean(-2))
+        distillation_logits = self.distillation_classifier(sequence_output.mean(-2))
+
+        # during inference, return the average of both classifier predictions
+        logits = (cls_logits + distillation_logits) / 2
+
+        if not return_dict:
+            output = (logits, cls_logits, distillation_logits) + outputs[1:]
+            return output
+
+        return EfficientFormerForImageClassificationWithTeacherOutput(
+            logits=logits,
+            cls_logits=cls_logits,
+            distillation_logits=distillation_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "EfficientFormerForImageClassification",
+    "EfficientFormerForImageClassificationWithTeacher",
+    "EfficientFormerModel",
+    "EfficientFormerPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..643097e79c3eab47b62f7e7c4781b2b276418176
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/efficientformer/modeling_tf_efficientformer.py
@@ -0,0 +1,1198 @@
+# coding=utf-8
+# Copyright 2023 Snapchat Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow EfficientFormer model."""
+
+import itertools
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import tensorflow as tf
+
+from ....activations_tf import ACT2FN
+from ....modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFBaseModelOutputWithPooling,
+    TFImageClassifierOutput,
+)
+from ....modeling_tf_utils import (
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ....tf_utils import shape_list, stable_softmax
+from ....utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_efficientformer import EfficientFormerConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "EfficientFormerConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "snap-research/efficientformer-l1-300"
+_EXPECTED_OUTPUT_SHAPE = [1, 49, 448]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "snap-research/efficientformer-l1-300"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "LABEL_281"
+
+
+class TFEfficientFormerPatchEmbeddings(keras.layers.Layer):
+    """
+    This class performs downsampling between two stages. For the input tensor with the shape [batch_size, num_channels,
+    height, width] it produces output tensor with the shape [batch_size, num_channels, height/stride, width/stride]
+    """
+
+    def __init__(
+        self, config: EfficientFormerConfig, num_channels: int, embed_dim: int, apply_norm: bool = True, **kwargs
+    ) -> None:
+        super().__init__(**kwargs)
+        self.num_channels = num_channels
+
+        self.padding = keras.layers.ZeroPadding2D(padding=config.downsample_pad)
+        self.projection = keras.layers.Conv2D(
+            filters=embed_dim,
+            kernel_size=config.downsample_patch_size,
+            strides=config.downsample_stride,
+            padding="valid",
+            name="projection",
+        )
+        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+        self.norm = (
+            keras.layers.BatchNormalization(axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="norm")
+            if apply_norm
+            else tf.identity
+        )
+        self.embed_dim = embed_dim
+
+    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+        tf.debugging.assert_shapes(
+            [(pixel_values, (..., None, None, self.num_channels))],
+            message="Make sure that the channel dimension of the pixel values match with the one set in the configuration.",
+        )
+        embeddings = self.projection(self.padding(pixel_values))
+        embeddings = self.norm(embeddings, training=training)
+        return embeddings
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "projection", None) is not None:
+            with tf.name_scope(self.projection.name):
+                self.projection.build([None, None, None, self.num_channels])
+        if getattr(self, "norm", None) is not None:
+            if hasattr(self.norm, "name"):
+                with tf.name_scope(self.norm.name):
+                    self.norm.build([None, None, None, self.embed_dim])
+
+
+class TFEfficientFormerSelfAttention(keras.layers.Layer):
+    def __init__(
+        self,
+        dim: int,
+        key_dim: int,
+        num_heads: int,
+        attention_ratio: int,
+        resolution: int,
+        config: EfficientFormerConfig,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_heads = num_heads
+        self.key_dim = key_dim
+        self.attention_ratio = attention_ratio
+        self.scale = key_dim**-0.5
+        self.total_key_dim = key_dim * num_heads
+        self.expanded_key_dim = int(attention_ratio * key_dim)
+        self.total_expanded_key_dim = int(self.expanded_key_dim * num_heads)
+        hidden_size = self.total_expanded_key_dim + self.total_key_dim * 2
+
+        self.qkv = keras.layers.Dense(
+            units=hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="qkv"
+        )
+        self.projection = keras.layers.Dense(
+            units=dim, kernel_initializer=get_initializer(config.initializer_range), name="projection"
+        )
+        self.resolution = resolution
+        self.dim = dim
+
+    def build(self, input_shape: tf.TensorShape) -> None:
+        points = list(itertools.product(range(self.resolution), range(self.resolution)))
+        num_points = len(points)
+        attention_offsets = {}
+
+        idxs = []
+
+        for point_1 in points:
+            for point_2 in points:
+                offset = (abs(point_1[0] - point_2[0]), abs(point_1[1] - point_2[1]))
+                if offset not in attention_offsets:
+                    attention_offsets[offset] = len(attention_offsets)
+                idxs.append(attention_offsets[offset])
+
+        self.attention_biases = self.add_weight(
+            shape=(self.num_heads, len(attention_offsets)),
+            initializer=keras.initializers.zeros(),
+            trainable=True,
+            name="attention_biases",
+        )
+        self.attention_bias_idxs = self.add_weight(
+            shape=(num_points, num_points),
+            trainable=False,
+            dtype=tf.int32,
+            name="attention_bias_idxs",
+        )
+
+        self.attention_bias_idxs.assign(tf.reshape(tf.cast(idxs, dtype=tf.int32), (num_points, num_points)))
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "qkv", None) is not None:
+            with tf.name_scope(self.qkv.name):
+                self.qkv.build([None, None, self.dim])
+        if getattr(self, "projection", None) is not None:
+            with tf.name_scope(self.projection.name):
+                self.projection.build([None, None, self.total_expanded_key_dim])
+
+    def call(
+        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+    ) -> tuple[tf.Tensor]:
+        batch_size, sequence_length, *_ = shape_list(hidden_states)
+        qkv = self.qkv(inputs=hidden_states)
+
+        query_layer, key_layer, value_layer = tf.split(
+            tf.reshape(tensor=qkv, shape=(batch_size, sequence_length, self.num_heads, -1)),
+            num_or_size_splits=[self.key_dim, self.key_dim, self.expanded_key_dim],
+            axis=3,
+        )
+
+        query_layer = tf.transpose(query_layer, perm=[0, 2, 1, 3])
+        key_layer = tf.transpose(key_layer, perm=[0, 2, 1, 3])
+        value_layer = tf.transpose(value_layer, perm=[0, 2, 1, 3])
+
+        attention_probs = tf.matmul(query_layer, tf.transpose(key_layer, perm=[0, 1, 3, 2]))
+        scale = tf.cast(self.scale, dtype=attention_probs.dtype)
+        attention_probs = tf.multiply(attention_probs, scale)
+
+        attention_biases = tf.gather(params=self.attention_biases, indices=self.attention_bias_idxs, axis=1)
+        attention_probs = attention_probs + attention_biases
+        attention_probs = stable_softmax(logits=attention_probs, axis=-1)
+
+        context_layer = tf.matmul(attention_probs, value_layer)
+        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
+
+        context_layer = tf.reshape(
+            tensor=context_layer, shape=(batch_size, sequence_length, self.total_expanded_key_dim)
+        )
+        context_layer = self.projection(context_layer)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class TFEfficientFormerConvStem(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, out_channels: int, **kwargs):
+        super().__init__(**kwargs)
+
+        self.padding = keras.layers.ZeroPadding2D(padding=1)
+        self.convolution1 = keras.layers.Conv2D(
+            filters=out_channels // 2, kernel_size=3, strides=2, padding="valid", name="convolution1"
+        )
+        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+        self.batchnorm_before = keras.layers.BatchNormalization(
+            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
+        )
+
+        self.convolution2 = keras.layers.Conv2D(
+            filters=out_channels,
+            kernel_size=3,
+            strides=2,
+            padding="valid",
+            name="convolution2",
+        )
+        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+        self.batchnorm_after = keras.layers.BatchNormalization(
+            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
+        )
+
+        self.activation = keras.layers.Activation(activation=keras.activations.relu, name="activation")
+        self.out_channels = out_channels
+        self.config = config
+
+    def call(self, pixel_values: tf.Tensor, training: bool = False) -> tf.Tensor:
+        features = self.batchnorm_before(self.convolution1(self.padding(pixel_values)), training=training)
+        features = self.activation(features)
+        features = self.batchnorm_after(self.convolution2(self.padding(features)), training=training)
+        features = self.activation(features)
+        return features
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "convolution1", None) is not None:
+            with tf.name_scope(self.convolution1.name):
+                self.convolution1.build([None, None, None, self.config.num_channels])
+        if getattr(self, "batchnorm_before", None) is not None:
+            with tf.name_scope(self.batchnorm_before.name):
+                self.batchnorm_before.build([None, None, None, self.out_channels // 2])
+        if getattr(self, "convolution2", None) is not None:
+            with tf.name_scope(self.convolution2.name):
+                self.convolution2.build([None, None, None, self.out_channels // 2])
+        if getattr(self, "batchnorm_after", None) is not None:
+            with tf.name_scope(self.batchnorm_after.name):
+                self.batchnorm_after.build([None, None, None, self.out_channels])
+        if getattr(self, "activation", None) is not None:
+            with tf.name_scope(self.activation.name):
+                self.activation.build(None)
+
+
+class TFEfficientFormerPooling(keras.layers.Layer):
+    def __init__(self, pool_size: int, **kwargs):
+        super().__init__(**kwargs)
+        self.pool = keras.layers.AveragePooling2D(pool_size=pool_size, strides=1, padding="same")
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        output = self.pool(hidden_states)
+        output = output - hidden_states
+        return output
+
+
+class TFEfficientFormerDenseMlp(keras.layers.Layer):
+    def __init__(
+        self,
+        config: EfficientFormerConfig,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.linear_in = keras.layers.Dense(
+            units=hidden_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_in"
+        )
+        self.activation = ACT2FN[config.hidden_act]
+        self.dropout = keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+        self.linear_out = keras.layers.Dense(
+            units=out_features, kernel_initializer=get_initializer(config.initializer_range), name="linear_out"
+        )
+        self.hidden_features = hidden_features
+        self.in_features = in_features
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.linear_in(inputs=hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+        hidden_states = self.linear_out(inputs=hidden_states)
+        hidden_states = self.dropout(inputs=hidden_states, training=training)
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "linear_in", None) is not None:
+            with tf.name_scope(self.linear_in.name):
+                self.linear_in.build([None, None, self.in_features])
+        if getattr(self, "linear_out", None) is not None:
+            with tf.name_scope(self.linear_out.name):
+                self.linear_out.build([None, None, self.hidden_features])
+
+
+class TFEfficientFormerConvMlp(keras.layers.Layer):
+    def __init__(
+        self,
+        config: EfficientFormerConfig,
+        in_features: int,
+        hidden_features: Optional[int] = None,
+        out_features: Optional[int] = None,
+        drop: float = 0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        out_features = out_features or in_features
+        hidden_features = hidden_features or in_features
+
+        self.convolution1 = keras.layers.Conv2D(
+            filters=hidden_features,
+            kernel_size=1,
+            name="convolution1",
+            padding="valid",
+        )
+
+        self.activation = ACT2FN[config.hidden_act]
+
+        self.convolution2 = keras.layers.Conv2D(
+            filters=out_features,
+            kernel_size=1,
+            name="convolution2",
+            padding="valid",
+        )
+
+        self.dropout = keras.layers.Dropout(rate=drop)
+
+        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+        self.batchnorm_before = keras.layers.BatchNormalization(
+            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_before"
+        )
+        # Use same default momentum and epsilon as PyTorch equivalent for BatchNormalization
+        self.batchnorm_after = keras.layers.BatchNormalization(
+            axis=-1, epsilon=config.batch_norm_eps, momentum=0.9, name="batchnorm_after"
+        )
+        self.hidden_features = hidden_features
+        self.in_features = in_features
+        self.out_features = out_features
+
+    def call(self, hidden_state: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_state = self.convolution1(hidden_state)
+        hidden_state = self.batchnorm_before(hidden_state, training=training)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.dropout(hidden_state, training=training)
+        hidden_state = self.convolution2(hidden_state)
+        hidden_state = self.batchnorm_after(hidden_state, training=training)
+        hidden_state = self.dropout(hidden_state, training=training)
+        return hidden_state
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "convolution1", None) is not None:
+            with tf.name_scope(self.convolution1.name):
+                self.convolution1.build([None, None, None, self.in_features])
+        if getattr(self, "convolution2", None) is not None:
+            with tf.name_scope(self.convolution2.name):
+                self.convolution2.build([None, None, None, self.hidden_features])
+        if getattr(self, "batchnorm_before", None) is not None:
+            with tf.name_scope(self.batchnorm_before.name):
+                self.batchnorm_before.build([None, None, None, self.hidden_features])
+        if getattr(self, "batchnorm_after", None) is not None:
+            with tf.name_scope(self.batchnorm_after.name):
+                self.batchnorm_after.build([None, None, None, self.out_features])
+
+
+# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->EfficientFormer
+class TFEfficientFormerDropPath(keras.layers.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    References:
+        (1) github.com:rwightman/pytorch-image-models
+    """
+
+    def __init__(self, drop_path: float, **kwargs):
+        super().__init__(**kwargs)
+        self.drop_path = drop_path
+
+    def call(self, x: tf.Tensor, training=None):
+        if training:
+            keep_prob = 1 - self.drop_path
+            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
+            random_tensor = tf.floor(random_tensor)
+            return (x / keep_prob) * random_tensor
+        return x
+
+
+class TFEfficientFormerFlat(keras.layers.Layer):
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+
+    def call(self, hidden_states: tf.Tensor) -> tuple[tf.Tensor]:
+        batch_size, _, _, in_channels = shape_list(hidden_states)
+        hidden_states = tf.reshape(hidden_states, shape=[batch_size, -1, in_channels])
+        return hidden_states
+
+
+class TFEfficientFormerMeta3D(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
+        super().__init__(**kwargs)
+
+        self.token_mixer = TFEfficientFormerSelfAttention(
+            dim=config.dim,
+            key_dim=config.key_dim,
+            num_heads=config.num_attention_heads,
+            attention_ratio=config.attention_ratio,
+            resolution=config.resolution,
+            name="token_mixer",
+            config=config,
+        )
+        self.dim = dim
+        self.config = config
+
+        self.layernorm1 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm1")
+        self.layernorm2 = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm2")
+        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+        self.mlp = TFEfficientFormerDenseMlp(config, in_features=dim, hidden_features=mlp_hidden_dim, name="mlp")
+
+        # Using `layers.Activation` instead of `tf.identity` to better control `training' behavior.
+        self.drop_path = (
+            TFEfficientFormerDropPath(drop_path)
+            if drop_path > 0.0
+            else keras.layers.Activation("linear", name="drop_path")
+        )
+        self.config = config
+
+    def build(self, input_shape=None):
+        self.layer_scale_1 = None
+        self.layer_scale_2 = None
+
+        if self.config.use_layer_scale:
+            self.layer_scale_1 = self.add_weight(
+                shape=(self.dim,),
+                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+                trainable=True,
+                name="layer_scale_1",
+            )
+            self.layer_scale_2 = self.add_weight(
+                shape=(self.dim,),
+                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+                trainable=True,
+                name="layer_scale_2",
+            )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "token_mixer", None) is not None:
+            with tf.name_scope(self.token_mixer.name):
+                self.token_mixer.build(None)
+        if getattr(self, "layernorm1", None) is not None:
+            with tf.name_scope(self.layernorm1.name):
+                self.layernorm1.build([None, None, self.dim])
+        if getattr(self, "layernorm2", None) is not None:
+            with tf.name_scope(self.layernorm2.name):
+                self.layernorm2.build([None, None, self.dim])
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "drop_path", None) is not None:
+            with tf.name_scope(self.drop_path.name):
+                self.drop_path.build(None)
+
+    def call(
+        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+    ) -> tuple[tf.Tensor]:
+        self_attention_outputs = self.token_mixer(
+            hidden_states=self.layernorm1(hidden_states, training=training),
+            output_attentions=output_attentions,
+            training=training,
+        )
+
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        if self.config.use_layer_scale:
+            layer_output = hidden_states + self.drop_path(
+                tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * attention_output,
+                training=training,
+            )
+            layer_output = layer_output + self.drop_path(
+                tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
+                * self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
+                training=training,
+            )
+        else:
+            layer_output = hidden_states + self.drop_path(attention_output, training=training)
+            layer_output = layer_output + self.drop_path(
+                self.mlp(hidden_states=self.layernorm2(inputs=layer_output, training=training), training=training),
+                training=training,
+            )
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+class TFEfficientFormerMeta3DLayers(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, **kwargs):
+        super().__init__(**kwargs)
+        drop_paths = [
+            config.drop_path_rate * (block_idx + sum(config.depths[:-1]))
+            for block_idx in range(config.num_meta3d_blocks)
+        ]
+        self.blocks = [
+            TFEfficientFormerMeta3D(config, config.hidden_sizes[-1], drop_path=drop_path, name=f"blocks.{i}")
+            for i, drop_path in enumerate(drop_paths)
+        ]
+
+    def call(
+        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+    ) -> tuple[tf.Tensor]:
+        all_attention_outputs = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.blocks):
+            if isinstance(hidden_states, tuple):
+                hidden_states = hidden_states[0]
+
+            hidden_states = layer_module(
+                hidden_states=hidden_states, output_attentions=output_attentions, training=training
+            )
+            if output_attentions:
+                all_attention_outputs = all_attention_outputs + (hidden_states[1],)
+
+        if output_attentions:
+            outputs = (hidden_states[0],) + all_attention_outputs
+            return outputs
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "blocks", None) is not None:
+            for layer in self.blocks:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFEfficientFormerMeta4D(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, dim: int, drop_path: float = 0.0, **kwargs):
+        super().__init__(**kwargs)
+        pool_size = config.pool_size if config.pool_size is not None else 3
+        self.token_mixer = TFEfficientFormerPooling(pool_size=pool_size, name="token_mixer")
+        self.dim = dim
+        mlp_hidden_dim = int(dim * config.mlp_expansion_ratio)
+        self.mlp = TFEfficientFormerConvMlp(
+            config=config, in_features=dim, hidden_features=mlp_hidden_dim, drop=config.hidden_dropout_prob, name="mlp"
+        )
+
+        self.drop_path = (
+            TFEfficientFormerDropPath(drop_path, name="drop_path")
+            if drop_path > 0.0
+            else keras.layers.Activation("linear", name="drop_path")
+        )
+        self.config = config
+
+    def build(self, input_shape=None):
+        self.layer_scale_1 = None
+        self.layer_scale_2 = None
+
+        if self.config.use_layer_scale:
+            self.layer_scale_1 = self.add_weight(
+                shape=(self.dim),
+                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+                trainable=True,
+                name="layer_scale_1",
+            )
+            self.layer_scale_2 = self.add_weight(
+                shape=(self.dim),
+                initializer=keras.initializers.Constant(value=self.config.layer_scale_init_value),
+                trainable=True,
+                name="layer_scale_2",
+            )
+
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "token_mixer", None) is not None:
+            with tf.name_scope(self.token_mixer.name):
+                self.token_mixer.build(None)
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+        if getattr(self, "drop_path", None) is not None:
+            with tf.name_scope(self.drop_path.name):
+                self.drop_path.build(None)
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]:
+        outputs = self.token_mixer(hidden_states)
+
+        if self.config.use_layer_scale:
+            layer_output = hidden_states + self.drop_path(
+                tf.expand_dims(tf.expand_dims(self.layer_scale_1, 0), 0) * outputs,
+                training=training,
+            )
+
+            layer_output = layer_output + self.drop_path(
+                tf.expand_dims(tf.expand_dims(self.layer_scale_2, 0), 0)
+                * self.mlp(hidden_state=layer_output, training=training),
+                training=training,
+            )
+
+        else:
+            layer_output = hidden_states + self.drop_path(outputs, training=training)
+            layer_output = layer_output + self.drop_path(
+                self.mlp(hidden_state=layer_output, training=training), training=training
+            )
+
+        return layer_output
+
+
+class TFEfficientFormerMeta4DLayers(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, stage_idx: int, **kwargs):
+        super().__init__(**kwargs)
+        num_layers = (
+            config.depths[stage_idx] if stage_idx != -1 else config.depths[stage_idx] - config.num_meta3d_blocks
+        )
+        drop_paths = [
+            config.drop_path_rate * (block_idx + sum(config.depths[:stage_idx])) for block_idx in range(num_layers)
+        ]
+
+        self.blocks = [
+            TFEfficientFormerMeta4D(
+                config=config, dim=config.hidden_sizes[stage_idx], drop_path=drop_paths[i], name=f"blocks.{i}"
+            )
+            for i in range(len(drop_paths))
+        ]
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]:
+        for layer_module in self.blocks:
+            hidden_states = layer_module(hidden_states=hidden_states, training=training)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "blocks", None) is not None:
+            for layer in self.blocks:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+class TFEfficientFormerIntermediateStage(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, index: int, **kwargs):
+        super().__init__(**kwargs)
+        self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=index, name="meta4D_layers")
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tuple[tf.Tensor]:
+        hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "meta4D_layers", None) is not None:
+            with tf.name_scope(self.meta4D_layers.name):
+                self.meta4D_layers.build(None)
+
+
+class TFEfficientFormerLastStage(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.meta4D_layers = TFEfficientFormerMeta4DLayers(config=config, stage_idx=-1, name="meta4D_layers")
+        self.flat = TFEfficientFormerFlat(name="flat")
+        self.meta3D_layers = TFEfficientFormerMeta3DLayers(config, name="meta3D_layers")
+
+    def call(
+        self, hidden_states: tf.Tensor, output_attentions: bool = False, training: bool = False
+    ) -> tuple[tf.Tensor]:
+        hidden_states = self.meta4D_layers(hidden_states=hidden_states, training=training)
+        hidden_states = self.flat(hidden_states=hidden_states)
+        hidden_states = self.meta3D_layers(
+            hidden_states=hidden_states, output_attentions=output_attentions, training=training
+        )
+
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "meta4D_layers", None) is not None:
+            with tf.name_scope(self.meta4D_layers.name):
+                self.meta4D_layers.build(None)
+        if getattr(self, "flat", None) is not None:
+            with tf.name_scope(self.flat.name):
+                self.flat.build(None)
+        if getattr(self, "meta3D_layers", None) is not None:
+            with tf.name_scope(self.meta3D_layers.name):
+                self.meta3D_layers.build(None)
+
+
+class TFEfficientFormerEncoder(keras.layers.Layer):
+    def __init__(self, config: EfficientFormerConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        num_intermediate_stages = len(config.depths) - 1
+        downsamples = [
+            config.downsamples[i] or config.hidden_sizes[i] != config.hidden_sizes[i + 1]
+            for i in range(num_intermediate_stages)
+        ]
+
+        intermediate_stages = []
+        layer_count = -1
+        for i in range(num_intermediate_stages):
+            layer_count += 1
+            intermediate_stages.append(
+                TFEfficientFormerIntermediateStage(config, i, name=f"intermediate_stages.{layer_count}")
+            )
+            if downsamples[i]:
+                layer_count += 1
+                intermediate_stages.append(
+                    TFEfficientFormerPatchEmbeddings(
+                        config,
+                        config.hidden_sizes[i],
+                        config.hidden_sizes[i + 1],
+                        name=f"intermediate_stages.{layer_count}",
+                    )
+                )
+        self.intermediate_stages = intermediate_stages
+        self.last_stage = TFEfficientFormerLastStage(config, name="last_stage")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        output_hidden_states: bool,
+        output_attentions: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> TFBaseModelOutput:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        for layer_module in self.intermediate_stages:
+            hidden_states = layer_module(hidden_states, training=training)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        layer_output = self.last_stage(hidden_states, output_attentions=output_attentions, training=training)
+
+        if output_attentions:
+            all_self_attentions = all_self_attentions + layer_output[1:]
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (layer_output[0],)
+
+        if not return_dict:
+            return tuple(v for v in [layer_output[0], all_hidden_states, all_self_attentions] if v is not None)
+
+        return TFBaseModelOutput(
+            last_hidden_state=layer_output[0],
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "last_stage", None) is not None:
+            with tf.name_scope(self.last_stage.name):
+                self.last_stage.build(None)
+        for layer in self.intermediate_stages:
+            with tf.name_scope(layer.name):
+                layer.build(None)
+
+
+@keras_serializable
+class TFEfficientFormerMainLayer(keras.layers.Layer):
+    config_class = EfficientFormerConfig
+
+    def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
+        super().__init__(**kwargs)
+        self.config = config
+
+        self.patch_embed = TFEfficientFormerConvStem(config, config.hidden_sizes[0], name="patch_embed")
+        self.encoder = TFEfficientFormerEncoder(config, name="encoder")
+        self.layernorm = keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="layernorm")
+
+    @unpack_inputs
+    def call(
+        self,
+        pixel_values: Optional[tf.Tensor] = None,
+        output_attentions: Optional[tf.Tensor] = None,
+        output_hidden_states: Optional[tf.Tensor] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[TFBaseModelOutput, tuple[tf.Tensor, ...]]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # When running on CPU, keras.layers.Conv2D and keras.layers.AveragePool2D do not
+        # support channels first NCHW format. A number of blocks contain both.
+        # So change the input format from (batch_size, num_channels, height, width) to
+        # (batch_size, height, width, num_channels) here.
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+        embedding_output = self.patch_embed(pixel_values, training=training)
+
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output, training=training)
+
+        # Change the hidden states from (batch_size, height, width, num_channels) to
+        # (batch_size, num_channels, height, width).
+        # The hidden states are in (batch_size, height, width, num_channels)
+        # shape after all stages except the MB3D blocks.
+        if output_hidden_states:
+            hidden_states = tuple(tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1][:-1]) + (
+                encoder_outputs[1][-1],
+            )
+
+        if not return_dict:
+            head_outputs = (sequence_output,)
+            return head_outputs + encoder_outputs[1:]
+
+        return TFBaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "patch_embed", None) is not None:
+            with tf.name_scope(self.patch_embed.name):
+                self.patch_embed.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "layernorm", None) is not None:
+            with tf.name_scope(self.layernorm.name):
+                self.layernorm.build([None, None, self.config.hidden_sizes[-1]])
+
+
+class TFEfficientFormerPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = EfficientFormerConfig
+    base_model_prefix = "efficientformer"
+    main_input_name = "pixel_values"
+
+
+EFFICIENTFORMER_START_DOCSTRING = r"""
+    This model is a TensorFlow
+    [keras.layers.Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer). Use it as a regular
+    TensorFlow Module and refer to the TensorFlow documentation for all matter related to general usage and behavior.
+
+
+    Parameters:
+        config ([`EfficientFormerConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+EFFICIENTFORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values ((`tf.Tensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`EfficientFormerImageProcessor.__call__`] for details.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare EfficientFormer Model transformer outputting raw hidden-states without any specific head on top.",
+    EFFICIENTFORMER_START_DOCSTRING,
+)
+class TFEfficientFormerModel(TFEfficientFormerPreTrainedModel):
+    def __init__(self, config: EfficientFormerConfig, **kwargs) -> None:
+        super().__init__(config, **kwargs)
+
+        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutputWithPooling,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def call(
+        self,
+        pixel_values: Optional[tf.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[tuple, TFBaseModelOutput]:
+        outputs = self.efficientformer(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "efficientformer", None) is not None:
+            with tf.name_scope(self.efficientformer.name):
+                self.efficientformer.build(None)
+
+
+@add_start_docstrings(
+    """
+    EfficientFormer Model transformer with an image classification head on top of pooled last hidden state, e.g. for
+    ImageNet.
+    """,
+    EFFICIENTFORMER_START_DOCSTRING,
+)
+class TFEfficientFormerForImageClassification(TFEfficientFormerPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: EfficientFormerConfig):
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
+
+        # Classifier head
+        self.classifier = (
+            keras.layers.Dense(config.num_labels, name="classifier")
+            if config.num_labels > 0
+            else keras.layers.Activation("linear", name="classifier")
+        )
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=TFImageClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def call(
+        self,
+        pixel_values: Optional[tf.Tensor] = None,
+        labels: Optional[tf.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[tf.Tensor, TFImageClassifierOutput]:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.efficientformer(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))
+
+        loss = None if labels is None else self.hf_compute_loss(labels, logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFImageClassifierOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "efficientformer", None) is not None:
+            with tf.name_scope(self.efficientformer.name):
+                self.efficientformer.build(None)
+        if getattr(self, "classifier", None) is not None:
+            if hasattr(self.classifier, "name"):
+                with tf.name_scope(self.classifier.name):
+                    self.classifier.build([None, None, self.config.hidden_sizes[-1]])
+
+
+@dataclass
+class TFEfficientFormerForImageClassificationWithTeacherOutput(ModelOutput):
+    """
+    Args:
+    Output type of [`EfficientFormerForImageClassificationWithTeacher`].
+        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores as the average of the cls_logits and distillation logits.
+        cls_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores of the classification head (i.e. the linear layer on top of the final hidden state of the
+            class token).
+        distillation_logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+            Prediction scores of the distillation head (i.e. the linear layer on top of the final hidden state of the
+            distillation token).
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when
+        `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer plus
+            the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when
+        `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+    """
+
+    logits: Optional[tf.Tensor] = None
+    cls_logits: Optional[tf.Tensor] = None
+    distillation_logits: Optional[tf.Tensor] = None
+    hidden_states: Optional[tuple[tf.Tensor]] = None
+    attentions: Optional[tuple[tf.Tensor]] = None
+
+
+@add_start_docstrings(
+    """
+    EfficientFormer Model transformer with image classification heads on top (a linear layer on top of the final hidden
+    state and a linear layer on top of the final hidden state of the distillation token) e.g. for ImageNet.
+
+    .. warning::
+            This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet
+            supported.
+    """,
+    EFFICIENTFORMER_START_DOCSTRING,
+)
+class TFEfficientFormerForImageClassificationWithTeacher(TFEfficientFormerPreTrainedModel):
+    def __init__(self, config: EfficientFormerConfig) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.efficientformer = TFEfficientFormerMainLayer(config, name="efficientformer")
+
+        # Classifier heads
+        self.classifier = (
+            keras.layers.Dense(config.num_labels, name="classifier")
+            if config.num_labels > 0
+            else keras.layers.Activation("linear", name="classifier")
+        )
+        self.distillation_classifier = (
+            keras.layers.Dense(config.num_labels, name="distillation_classifier")
+            if config.num_labels > 0
+            else keras.layers.Activation("linear", name="distillation_classifier")
+        )
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(EFFICIENTFORMER_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=TFEfficientFormerForImageClassificationWithTeacherOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def call(
+        self,
+        pixel_values: Optional[tf.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+    ) -> Union[tuple, TFEfficientFormerForImageClassificationWithTeacherOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if training:
+            raise Exception(
+                "This model supports inference-only. Fine-tuning with distillation (i.e. with a teacher) is not yet supported."
+            )
+
+        outputs = self.efficientformer(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        sequence_output = outputs[0]
+
+        cls_logits = self.classifier(tf.reduce_mean(sequence_output, axis=-2))
+        distillation_logits = self.distillation_classifier(tf.reduce_mean(sequence_output, axis=-2))
+        logits = (cls_logits + distillation_logits) / 2
+
+        if not return_dict:
+            output = (logits, cls_logits, distillation_logits) + outputs[1:]
+            return output
+
+        return TFEfficientFormerForImageClassificationWithTeacherOutput(
+            logits=logits,
+            cls_logits=cls_logits,
+            distillation_logits=distillation_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "efficientformer", None) is not None:
+            with tf.name_scope(self.efficientformer.name):
+                self.efficientformer.build(None)
+        if getattr(self, "classifier", None) is not None:
+            if hasattr(self.classifier, "name"):
+                with tf.name_scope(self.classifier.name):
+                    self.classifier.build([None, None, self.config.hidden_sizes[-1]])
+        if getattr(self, "distillation_classifier", None) is not None:
+            if hasattr(self.distillation_classifier, "name"):
+                with tf.name_scope(self.distillation_classifier.name):
+                    self.distillation_classifier.build([None, None, self.config.hidden_sizes[-1]])
+
+
+__all__ = [
+    "TFEfficientFormerForImageClassification",
+    "TFEfficientFormerForImageClassificationWithTeacher",
+    "TFEfficientFormerModel",
+    "TFEfficientFormerPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2beb8f463ff10af33ea980599ea4d5fc05888acb
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2023 The HuggingFace and Baidu Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_ernie_m import *
+    from .modeling_ernie_m import *
+    from .tokenization_ernie_m import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
new file mode 100644
index 0000000000000000000000000000000000000000..839b17b11bf7248136668d8c325f03e10a64cd96
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/configuration_ernie_m.py
@@ -0,0 +1,112 @@
+# coding=utf-8
+# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ErnieM model configuration"""
+# Adapted from original paddlenlp repository.(https://github.com/PaddlePaddle/PaddleNLP/blob/develop/paddlenlp/transformers/ernie_m/configuration.py)
+
+from __future__ import annotations
+
+from ....configuration_utils import PretrainedConfig
+
+
+class ErnieMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ErnieMModel`]. It is used to instantiate a
+    Ernie-M model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the `Ernie-M`
+    [susnato/ernie-m-base_pytorch](https://huggingface.co/susnato/ernie-m-base_pytorch) architecture.
+
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 250002):
+            Vocabulary size of `inputs_ids` in [`ErnieMModel`]. Also is the vocab size of token embedding matrix.
+            Defines the number of different tokens that can be represented by the `inputs_ids` passed when calling
+            [`ErnieMModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the embedding layer, encoder layers and pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the feed-forward (ff) layer in the encoder. Input tensors to feed-forward layers are
+            firstly projected from hidden_size to intermediate_size, and then projected back to hidden_size. Typically
+            intermediate_size is larger than hidden_size.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function in the feed-forward layer. `"gelu"`, `"relu"` and any other torch
+            supported activation functions are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability used in `MultiHeadAttention` in all encoder layers to drop some attention target.
+        max_position_embeddings (`int`, *optional*, defaults to 514):
+            The maximum value of the dimensionality of position encoding, which dictates the maximum supported length
+            of an input sequence.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the normal initializer for initializing all weight matrices. The index of padding
+            token in the token vocabulary.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            Padding token id.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+        act_dropout (`float`, *optional*, defaults to 0.0):
+            This dropout probability is used in `ErnieMEncoderLayer` after activation.
+
+    A normal_initializer initializes weight matrices as normal distributions. See
+    `ErnieMPretrainedModel._init_weights()` for how weights are initialized in `ErnieMModel`.
+    """
+
+    model_type = "ernie_m"
+    attribute_map: dict[str, str] = {"dropout": "classifier_dropout", "num_classes": "num_labels"}
+
+    def __init__(
+        self,
+        vocab_size: int = 250002,
+        hidden_size: int = 768,
+        num_hidden_layers: int = 12,
+        num_attention_heads: int = 12,
+        intermediate_size: int = 3072,
+        hidden_act: str = "gelu",
+        hidden_dropout_prob: float = 0.1,
+        attention_probs_dropout_prob: float = 0.1,
+        max_position_embeddings: int = 514,
+        initializer_range: float = 0.02,
+        pad_token_id: int = 1,
+        layer_norm_eps: float = 1e-05,
+        classifier_dropout=None,
+        act_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.classifier_dropout = classifier_dropout
+        self.act_dropout = act_dropout
+
+
+__all__ = ["ErnieMConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0e97c132d0989079aa647cac4e64bbae451a116
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/modeling_ernie_m.py
@@ -0,0 +1,1061 @@
+# coding=utf-8
+# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ErnieM model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn, tensor
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_ernie_m import ErnieMConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "susnato/ernie-m-base_pytorch"
+_CONFIG_FOR_DOC = "ErnieMConfig"
+_TOKENIZER_FOR_DOC = "ErnieMTokenizer"
+
+
+# Adapted from paddlenlp.transformers.ernie_m.modeling.ErnieEmbeddings
+class ErnieMEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(
+            config.max_position_embeddings, config.hidden_size, padding_idx=config.pad_token_id
+        )
+        self.layer_norm = nn.LayerNorm(normalized_shape=config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
+        self.padding_idx = config.pad_token_id
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.LongTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        if position_ids is None:
+            input_shape = inputs_embeds.size()[:-1]
+            ones = torch.ones(input_shape, dtype=torch.int64, device=inputs_embeds.device)
+            seq_length = torch.cumsum(ones, dim=1)
+            position_ids = seq_length - ones
+
+            if past_key_values_length > 0:
+                position_ids = position_ids + past_key_values_length
+        # to mimic paddlenlp implementation
+        position_ids += 2
+        position_embeddings = self.position_embeddings(position_ids)
+        embeddings = inputs_embeds + position_embeddings
+        embeddings = self.layer_norm(embeddings)
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class ErnieMSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.q_proj = nn.Linear(config.hidden_size, self.all_head_size)
+        self.k_proj = nn.Linear(config.hidden_size, self.all_head_size)
+        self.v_proj = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        mixed_query_layer = self.q_proj(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_values is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_values[0]
+            value_layer = past_key_values[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.k_proj(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.v_proj(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_values is not None:
+            key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
+            value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
+            key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.k_proj(hidden_states))
+            value_layer = self.transpose_for_scores(self.v_proj(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        use_cache = past_key_values is not None
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_values` is always `None`
+            past_key_values = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if use_cache:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in ErnieMModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_values,)
+        return outputs
+
+
+class ErnieMAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        self.self_attn = ErnieMSelfAttention(config, position_embedding_type=position_embedding_type)
+        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self_attn.num_attention_heads, self.self_attn.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self_attn.q_proj = prune_linear_layer(self.self_attn.q_proj, index)
+        self.self_attn.k_proj = prune_linear_layer(self.self_attn.k_proj, index)
+        self.self_attn.v_proj = prune_linear_layer(self.self_attn.v_proj, index)
+        self.out_proj = prune_linear_layer(self.out_proj, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self_attn.num_attention_heads = self.self_attn.num_attention_heads - len(heads)
+        self.self_attn.all_head_size = self.self_attn.attention_head_size * self.self_attn.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self_attn(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_values,
+            output_attentions,
+        )
+        attention_output = self.out_proj(self_outputs[0])
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class ErnieMEncoderLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # to mimic paddlenlp implementation
+        dropout = 0.1 if config.hidden_dropout_prob is None else config.hidden_dropout_prob
+        act_dropout = config.hidden_dropout_prob if config.act_dropout is None else config.act_dropout
+
+        self.self_attn = ErnieMAttention(config)
+        self.linear1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.dropout = nn.Dropout(act_dropout)
+        self.linear2 = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout1 = nn.Dropout(dropout)
+        self.dropout2 = nn.Dropout(dropout)
+        if isinstance(config.hidden_act, str):
+            self.activation = ACT2FN[config.hidden_act]
+        else:
+            self.activation = config.hidden_act
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = True,
+    ):
+        residual = hidden_states
+        if output_attentions:
+            hidden_states, attention_opt_weights = self.self_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+            )
+
+        else:
+            hidden_states = self.self_attn(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+            )
+        hidden_states = residual + self.dropout1(hidden_states)
+        hidden_states = self.norm1(hidden_states)
+        residual = hidden_states
+
+        hidden_states = self.linear1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.linear2(hidden_states)
+        hidden_states = residual + self.dropout2(hidden_states)
+        hidden_states = self.norm2(hidden_states)
+
+        if output_attentions:
+            return hidden_states, attention_opt_weights
+        else:
+            return hidden_states
+
+
+class ErnieMEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([ErnieMEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+
+    def forward(
+        self,
+        input_embeds: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        hidden_states = () if output_hidden_states else None
+        attentions = () if output_attentions else None
+
+        output = input_embeds
+        if output_hidden_states:
+            hidden_states = hidden_states + (output,)
+        for i, layer in enumerate(self.layers):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            output, opt_attn_weights = layer(
+                hidden_states=output,
+                attention_mask=attention_mask,
+                head_mask=layer_head_mask,
+                past_key_values=past_key_values[i] if past_key_values is not None else None,
+            )
+
+            if output_hidden_states:
+                hidden_states = hidden_states + (output,)
+            if output_attentions:
+                attentions = attentions + (opt_attn_weights,)
+
+        last_hidden_state = output
+        if not return_dict:
+            return tuple(v for v in [last_hidden_state, hidden_states, attentions] if v is not None)
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=last_hidden_state, hidden_states=hidden_states, attentions=attentions
+        )
+
+
+class ErnieMPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class ErnieMPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: ErnieMConfig
+    base_model_prefix = "ernie_m"
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+ERNIE_M_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`ErnieMConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+ERNIE_M_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`ErnieMTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare ErnieM Model transformer outputting raw hidden-states without any specific head on top.",
+    ERNIE_M_START_DOCSTRING,
+)
+class ErnieMModel(ErnieMPreTrainedModel):
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.initializer_range = config.initializer_range
+        self.embeddings = ErnieMEmbeddings(config)
+        self.encoder = ErnieMEncoder(config)
+        self.pooler = ErnieMPooler(config) if add_pooling_layer else None
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layers[layer].self_attn.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPastAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[tensor] = None,
+        position_ids: Optional[tensor] = None,
+        attention_mask: Optional[tensor] = None,
+        head_mask: Optional[tensor] = None,
+        inputs_embeds: Optional[tensor] = None,
+        past_key_values: Optional[tuple[tuple[tensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time.")
+
+        # init the default bool value
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = past_key_values.get_seq_length()
+
+        # Adapted from paddlenlp.transformers.ernie_m.ErnieMModel
+        if attention_mask is None:
+            attention_mask = (input_ids == self.config.pad_token_id).to(torch.float32)
+            attention_mask *= torch.finfo(attention_mask.dtype).min
+            if past_key_values is not None:
+                batch_size = past_key_values[0][0].shape[0]
+                past_mask = torch.zeros([batch_size, 1, 1, past_key_values_length], dtype=attention_mask.dtype)
+                attention_mask = torch.concat([past_mask, attention_mask], dim=-1)
+        # For 2D attention_mask from tokenizer
+        elif attention_mask.ndim == 2:
+            attention_mask = attention_mask.to(torch.float32)
+            attention_mask = 1.0 - attention_mask
+            attention_mask *= torch.finfo(attention_mask.dtype).min
+
+        extended_attention_mask = attention_mask.unsqueeze(1).unsqueeze(1)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            sequence_output = encoder_outputs[0]
+            pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
+            return (sequence_output, pooler_output) + encoder_outputs[1:]
+
+        sequence_output = encoder_outputs["last_hidden_state"]
+        pooler_output = self.pooler(sequence_output) if self.pooler is not None else None
+        hidden_states = None if not output_hidden_states else encoder_outputs["hidden_states"]
+        attentions = None if not output_attentions else encoder_outputs["attentions"]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooler_output,
+            hidden_states=hidden_states,
+            attentions=attentions,
+        )
+
+
+@add_start_docstrings(
+    """ErnieM Model transformer with a sequence classification/regression head on top (a linear layer on top of
+    the pooled output) e.g. for GLUE tasks.""",
+    ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForSequenceClassification(ErnieMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.ernie_m = ErnieMModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = True,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ernie_m(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            output_hidden_states=output_hidden_states,
+            output_attentions=output_attentions,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """ErnieM Model with a multiple choice classification head on top (a linear layer on top of
+    the pooled output and a softmax) e.g. for RocStories/SWAG tasks.""",
+    ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForMultipleChoice(ErnieMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.ernie_m = ErnieMModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.FloatTensor], MultipleChoiceModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.ernie_m(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """ErnieM Model with a token classification head on top (a linear layer on top of
+    the hidden-states output) e.g. for Named-Entity-Recognition (NER) tasks.""",
+    ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForTokenClassification(ErnieMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = True,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.FloatTensor], TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ernie_m(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """ErnieM Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).""",
+    ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForQuestionAnswering(ErnieMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.ernie_m = ErnieMModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.ernie_m(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """ErnieMForInformationExtraction is a Ernie-M Model with two linear layer on top of the hidden-states output to
+    compute `start_prob` and `end_prob`, designed for Universal Information Extraction.""",
+    ERNIE_M_START_DOCSTRING,
+)
+class ErnieMForInformationExtraction(ErnieMPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.ernie_m = ErnieMModel(config)
+        self.linear_start = nn.Linear(config.hidden_size, 1)
+        self.linear_end = nn.Linear(config.hidden_size, 1)
+        self.sigmoid = nn.Sigmoid()
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(ERNIE_M_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.FloatTensor], QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for position (index) for computing the start_positions loss. Position outside of the sequence are
+            not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) for computing the end_positions loss. Position outside of the sequence are not
+            taken into account for computing the loss.
+        """
+
+        result = self.ernie_m(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        if return_dict:
+            sequence_output = result.last_hidden_state
+        elif not return_dict:
+            sequence_output = result[0]
+
+        start_logits = self.linear_start(sequence_output)
+        start_logits = start_logits.squeeze(-1)
+        end_logits = self.linear_end(sequence_output)
+        end_logits = end_logits.squeeze(-1)
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = BCEWithLogitsLoss()
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            return tuple(
+                i
+                for i in [total_loss, start_logits, end_logits, result.hidden_states, result.attentions]
+                if i is not None
+            )
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=result.hidden_states,
+            attentions=result.attentions,
+        )
+
+
+__all__ = [
+    "ErnieMForMultipleChoice",
+    "ErnieMForQuestionAnswering",
+    "ErnieMForSequenceClassification",
+    "ErnieMForTokenClassification",
+    "ErnieMModel",
+    "ErnieMPreTrainedModel",
+    "ErnieMForInformationExtraction",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
new file mode 100644
index 0000000000000000000000000000000000000000..4c33abd043e6d170485e6d01298788d461275aa6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/ernie_m/tokenization_ernie_m.py
@@ -0,0 +1,409 @@
+# coding=utf-8
+# Copyright 2023 Xuan Ouyang, Shuohuan Wang, Chao Pang, Yu Sun, Hao Tian, Hua Wu, Haifeng Wang and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for Ernie-M."""
+
+import os
+import unicodedata
+from typing import Any, Optional
+
+import sentencepiece as spm
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
+from ....utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "sentencepiece_model_ckpt": "sentencepiece.bpe.model"}
+
+RESOURCE_FILES_NAMES = {
+    "sentencepiece_model_file": "sentencepiece.bpe.model",
+    "vocab_file": "vocab.txt",
+}
+
+
+# Adapted from paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer
+@requires(backends=("sentencepiece",))
+class ErnieMTokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a Ernie-M tokenizer. It uses the `sentencepiece` tools to cut the words to sub-words.
+
+    Args:
+        sentencepiece_model_file (`str`):
+            The file path of sentencepiece model.
+        vocab_file (`str`, *optional*):
+            The file path of the vocabulary.
+        do_lower_case (`str`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            A special token representing the `unknown (out-of-vocabulary)` token. An unknown token is set to be
+            `unk_token` inorder to be converted to an ID.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            A special token separating two different sentences in the same input.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            A special token used to make arrays of tokens the same size for batching purposes.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            A special token used for sequence classification. It is the last token of the sequence when built with
+            special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            A special token representing a masked token. This is the token used in the masked language modeling task
+            which the model tries to predict the original unmasked ones.
+    """
+
+    # Ernie-M model doesn't have token_type embedding.
+    model_input_names: list[str] = ["input_ids"]
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    resource_files_names = RESOURCE_FILES_NAMES
+
+    def __init__(
+        self,
+        sentencepiece_model_ckpt,
+        vocab_file=None,
+        do_lower_case=False,
+        encoding="utf8",
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        # Mask token behave like a normal word, i.e. include the space before it and
+        # is included in the raw text, there should be a match in a non-normalized sentence.
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        self.do_lower_case = do_lower_case
+        self.sentencepiece_model_ckpt = sentencepiece_model_ckpt
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(sentencepiece_model_ckpt)
+
+        # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
+        if vocab_file is not None:
+            self.vocab = self.load_vocab(filepath=vocab_file)
+        else:
+            self.vocab = {self.sp_model.id_to_piece(id): id for id in range(self.sp_model.get_piece_size())}
+        self.reverse_vocab = {v: k for k, v in self.vocab.items()}
+
+        super().__init__(
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            vocab_file=vocab_file,
+            encoding=encoding,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    def get_offset_mapping(self, text):
+        if text is None:
+            return None
+
+        split_tokens = self.tokenize(text)
+        normalized_text, char_mapping = "", []
+
+        for i, ch in enumerate(text):
+            if ch in self.SP_CHAR_MAPPING:
+                ch = self.SP_CHAR_MAPPING.get(ch)
+            else:
+                ch = unicodedata.normalize("NFKC", ch)
+            if self.is_whitespace(ch):
+                continue
+            normalized_text += ch
+            char_mapping.extend([i] * len(ch))
+
+        text, token_mapping, offset = normalized_text, [], 0
+
+        if self.do_lower_case:
+            text = text.lower()
+
+        for token in split_tokens:
+            if token[:1] == "▁":
+                token = token[1:]
+            start = text[offset:].index(token) + offset
+            end = start + len(token)
+
+            token_mapping.append((char_mapping[start], char_mapping[end - 1] + 1))
+            offset = end
+        return token_mapping
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.sentencepiece_model_ckpt)
+
+    def clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        return "".join(self.SP_CHAR_MAPPING.get(c, c) for c in text)
+
+    def _tokenize(self, text, enable_sampling=False, nbest_size=64, alpha=0.1):
+        """Tokenize a string."""
+
+        if self.sp_model_kwargs.get("enable_sampling") is True:
+            enable_sampling = True
+        if self.sp_model_kwargs.get("alpha") is not None:
+            alpha = self.sp_model_kwargs.get("alpha")
+        if self.sp_model_kwargs.get("nbest_size") is not None:
+            nbest_size = self.sp_model_kwargs.get("nbest_size")
+
+        if not enable_sampling:
+            pieces = self.sp_model.EncodeAsPieces(text)
+        else:
+            pieces = self.sp_model.SampleEncodeAsPieces(text, nbest_size, alpha)
+        new_pieces = []
+        for pi, piece in enumerate(pieces):
+            if piece == SPIECE_UNDERLINE:
+                if not pieces[pi + 1].startswith(SPIECE_UNDERLINE) and pi != 0:
+                    new_pieces.append(SPIECE_UNDERLINE)
+                    continue
+                else:
+                    continue
+            lst_i = 0
+            for i, chunk in enumerate(piece):
+                if chunk == SPIECE_UNDERLINE:
+                    continue
+                if self.is_ch_char(chunk) or self.is_punct(chunk):
+                    if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
+                        new_pieces.append(piece[lst_i:i])
+                    new_pieces.append(chunk)
+                    lst_i = i + 1
+                elif chunk.isdigit() and i > 0 and not piece[i - 1].isdigit():
+                    if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
+                        new_pieces.append(piece[lst_i:i])
+                    lst_i = i
+                elif not chunk.isdigit() and i > 0 and piece[i - 1].isdigit():
+                    if i > lst_i and piece[lst_i:i] != SPIECE_UNDERLINE:
+                        new_pieces.append(piece[lst_i:i])
+                    lst_i = i
+            if len(piece) > lst_i:
+                new_pieces.append(piece[lst_i:])
+        return new_pieces
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    def convert_ids_to_string(self, ids):
+        """
+        Converts a sequence of tokens (strings for sub-words) in a single string.
+        """
+        tokens = self.convert_ids_to_tokens(ids)
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
+    def _convert_token_to_id(self, token):
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    # to mimic paddlenlp.transformers.ernie_m.tokenizer.ErnieMTokenizer functioning
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.reverse_vocab.get(index, self.unk_token)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        r"""
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An ErnieM sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `list[int]`: List of input_id with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        _cls = [self.cls_token_id]
+        _sep = [self.sep_token_id]
+        return _cls + token_ids_0 + _sep + _sep + token_ids_1 + _sep
+
+    def build_offset_mapping_with_special_tokens(self, offset_mapping_0, offset_mapping_1=None):
+        r"""
+        Build offset map from a pair of offset map by concatenating and adding offsets of special tokens. An Ernie-M
+        offset_mapping has the following format:
+
+        - single sequence: `(0,0) X (0,0)`
+        - pair of sequences: `(0,0) A (0,0) (0,0) B (0,0)`
+
+        Args:
+            offset_mapping_ids_0 (`list[tuple]`):
+                List of char offsets to which the special tokens will be added.
+            offset_mapping_ids_1 (`list[tuple]`, *optional*):
+                Optional second list of wordpiece offsets for offset mapping pairs.
+        Returns:
+            `list[tuple]`: List of wordpiece offsets with the appropriate offsets of special tokens.
+        """
+        if offset_mapping_1 is None:
+            return [(0, 0)] + offset_mapping_0 + [(0, 0)]
+
+        return [(0, 0)] + offset_mapping_0 + [(0, 0), (0, 0)] + offset_mapping_1 + [(0, 0)]
+
+    def get_special_tokens_mask(self, token_ids_0, token_ids_1=None, already_has_special_tokens=False):
+        r"""
+        Retrieves sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `encode` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of ids of the first sequence.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`str`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `list[int]`:
+                The list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            if token_ids_1 is not None:
+                raise ValueError(
+                    "You should not supply a second sequence if the provided sequence of "
+                    "ids is already formatted with special tokens for the model."
+                )
+            return [1 if x in [self.sep_token_id, self.cls_token_id] else 0 for x in token_ids_0]
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create the token type IDs corresponding to the sequences passed. [What are token type
+        IDs?](../glossary#token-type-ids) Should be overridden in a subclass if the model has a special way of
+        building: those.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                The first tokenized sequence.
+            token_ids_1 (`list[int]`, *optional*):
+                The second tokenized sequence.
+        Returns:
+            `list[int]`: The token type ids.
+        """
+        # called when `add_special_tokens` is True, so align with `build_inputs_with_special_tokens` method
+        if token_ids_1 is None:
+            # [CLS] X [SEP]
+            return (len(token_ids_0) + 2) * [0]
+
+        # [CLS] A [SEP] [SEP] B [SEP]
+        return [0] * (len(token_ids_0) + 1) + [1] * (len(token_ids_1) + 3)
+
+    def is_ch_char(self, char):
+        """
+        is_ch_char
+        """
+        if "\u4e00" <= char <= "\u9fff":
+            return True
+        return False
+
+    def is_alpha(self, char):
+        """
+        is_alpha
+        """
+        if ("a" <= char <= "z") or ("A" <= char <= "Z"):
+            return True
+        return False
+
+    def is_punct(self, char):
+        """
+        is_punct
+        """
+        if char in ",;:.?!~，；：。？！《》【】":
+            return True
+        return False
+
+    def is_whitespace(self, char):
+        """
+        is whitespace
+        """
+        if char == " " or char == "\t" or char == "\n" or char == "\r":
+            return True
+        if len(char) == 1:
+            cat = unicodedata.category(char)
+            if cat == "Zs":
+                return True
+        return False
+
+    def load_vocab(self, filepath):
+        token_to_idx = {}
+        with open(filepath, "r", encoding="utf-8") as f:
+            for index, line in enumerate(f):
+                token = line.rstrip("\n")
+                token_to_idx[token] = int(index)
+
+        return token_to_idx
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+
+        tokenizer_model_file = os.path.join(save_directory, "sentencepiece.bpe.model")
+        with open(tokenizer_model_file, "wb") as fi:
+            content_spiece_model = self.sp_model.serialized_model_proto()
+            fi.write(content_spiece_model)
+
+        return (vocab_file,)
+
+
+__all__ = ["ErnieMTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c23b58f35012f1ddc00e2275c484810287de6f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_gptsan_japanese import *
+    from .modeling_gptsan_japanese import *
+    from .tokenization_gptsan_japanese import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7d1e23e080eddc5cce28e72c551d123bec09de6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/configuration_gptsan_japanese.py
@@ -0,0 +1,157 @@
+# coding=utf-8
+# Copyright 2023, HuggingFace Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""GPTSAN-japanese model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GPTSanJapaneseConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`GPTSanJapaneseModel`]. It is used to instantiate
+    a GPTSANJapanese model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the GPTSANJapanese
+    [Tanrei/GPTSAN-japanese](https://huggingface.co/Tanrei/GPTSAN-japanese) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Arguments:
+        vocab_size (`int`, *optional*, defaults to 36000):
+            Vocabulary size of the GPTSANJapanese model. Defines the number of different tokens that can be represented
+            by the `inputs_ids` passed when calling [`GPTSanJapaneseModel`].
+        max_position_embeddings (`int`, *optional*, defaults to 1280):
+            The maximum sequence length that this model might ever be used with. Defaults set this to 1280.
+        d_model (`int`, *optional*, defaults to 1024):
+            Size of the encoder layers and the pooler layer.
+        d_ff (`int`, *optional*, defaults to 8192):
+            Size of the intermediate feed forward layer in each `SwitchTransformersBlock`.
+        d_ext (`int`, *optional*, defaults to 4096):
+            Size of the intermediate feed forward layer in each Extra-layers.
+        d_spout (`int`, *optional*, defaults to 128):
+            Size of the `spout` vector.
+        num_switch_layers (`int`, *optional*, defaults to 10):
+            Number of layers in the Switch Transformer layer.
+        num_ext_layers (`int`, *optional*, defaults to 0):
+            Number of layers in the Extra-layers.
+        num_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_experts (`int`, *optional*, defaults to 16):
+            Number of experts for each SwitchTransformer layer.
+        expert_capacity (`int`, *optional*, defaults to 128):
+            Number of tokens that can be stored in each expert. If set to 1, the model will behave like a regular
+            Transformer.
+        dropout_rate (`float`, *optional*, defaults to 0.0):
+            The ratio for all dropout layers.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        router_bias (`bool`, *optional*, defaults to `False`):
+            Whether to add a bias to the router.
+        router_jitter_noise (`float`, *optional*, defaults to 0.0):
+            Amount of noise to add to the router. Set it to 0.0 during prediction or set small value (usually 1e-2)
+            during training.
+        router_dtype (`str`, *optional*, default to `"float32"`):
+            The `dtype` used for the routers. It is preferable to keep the `dtype` to `"float32"` as specified in the
+            *selective precision* discussion in [the paper](https://huggingface.co/papers/2101.03961).
+        router_ignore_padding_tokens (`bool`, *optional*, defaults to `False`):
+            Whether to ignore padding tokens when routing.
+        output_hidden_states (`bool`, *optional*, default to `False`):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        output_attentions (`bool`, *optional*, defaults to `False`):
+            Whether or not to return the attentions tensors of all attention layers.
+        initializer_factor (`float`, *optional*, defaults to 0.002):
+            A factor for initializing all weight matrices.
+        output_router_logits (`bool`, *optional*, default to `False`):
+            Whether or not to return the router logits of all experts.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models)
+    """
+
+    model_type = "gptsan-japanese"
+    keys_to_ignore_at_inference = [
+        "past_key_values",
+    ]
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        vocab_size=36000,
+        max_position_embeddings=1280,
+        d_model=1024,
+        d_ff=8192,
+        d_ext=4096,
+        d_spout=128,
+        num_switch_layers=10,
+        num_ext_layers=0,
+        num_heads=16,
+        num_experts=16,
+        expert_capacity=128,
+        dropout_rate=0.0,
+        layer_norm_epsilon=1e-5,
+        router_bias=False,
+        router_jitter_noise=0.0,
+        router_dtype="float32",
+        router_ignore_padding_tokens=False,
+        output_hidden_states=False,
+        output_attentions=False,
+        initializer_factor=0.002,
+        output_router_logits=False,
+        use_cache=True,
+        separator_token_id=35998,
+        pad_token_id=35995,
+        eos_token_id=35999,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.d_ff = d_ff
+        self.d_ext = d_ext
+        self.d_spout = d_spout
+        self.num_switch_layers = num_switch_layers
+        self.num_ext_layers = num_ext_layers
+        self.num_layers = num_switch_layers + num_ext_layers
+        self.num_heads = num_heads
+        self.num_experts = num_experts
+        self.expert_capacity = expert_capacity
+        self.dropout_rate = dropout_rate
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.router_bias = router_bias
+        self.router_jitter_noise = router_jitter_noise
+        self.router_dtype = router_dtype
+        self.router_ignore_padding_tokens = router_ignore_padding_tokens
+        self.output_hidden_states = output_hidden_states
+        self.output_attentions = output_attentions
+        self.initializer_factor = initializer_factor
+        self.output_router_logits = output_router_logits
+        self.use_cache = use_cache
+
+        super().__init__(
+            separator_token_id=separator_token_id,
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+
+__all__ = ["GPTSanJapaneseConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..0e2d27f03bacbd60ade444fe3221ada9f714cb60
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/modeling_gptsan_japanese.py
@@ -0,0 +1,1331 @@
+# coding=utf-8
+# Copyright 2023 Toshiyuki Sakamoto(tanreinama) and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GPTSANJapanese model."""
+
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_outputs import MoECausalLMOutputWithPast, MoEModelOutputWithPastAndCrossAttentions
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+    DUMMY_INPUTS,
+    DUMMY_MASK,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_torch_fx_proxy,
+    logging,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_gptsan_japanese import GPTSanJapaneseConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "GPTSanJapaneseConfig"
+_CHECKPOINT_FOR_DOC = "Tanrei/GPTSAN-japanese"
+
+####################################################
+# This dict contains ids and associated url
+# for the pretrained weights provided with the models
+####################################################
+
+
+def router_z_loss_func(router_logits: torch.Tensor) -> float:
+    r"""
+    Compute the router z-loss implemented in PyTorch.
+
+    The router z-loss was introduced in [Designing Effective Sparse Expert Models](https://huggingface.co/papers/2202.08906).
+    It encourages router logits to remain small in an effort to improve stability.
+
+    Args:
+        router_logits (`float`):
+            Input logits of shape [batch_size, sequence_length, num_experts]
+
+    Returns:
+        Scalar router z-loss.
+    """
+    num_groups, tokens_per_group, _ = router_logits.shape
+    log_z = torch.logsumexp(router_logits, dim=-1)
+    z_loss = log_z**2
+    return torch.sum(z_loss) / (num_groups * tokens_per_group)
+
+
+def load_balancing_loss_func(router_probs: torch.Tensor, expert_indices: torch.Tensor) -> float:
+    r"""
+    Computes auxiliary load balancing loss as in Switch Transformer - implemented in Pytorch.
+
+    See Switch Transformer (https://huggingface.co/papers/2101.03961) for more details. This function implements the loss
+    function presented in equations (4) - (6) of the paper. It aims at penalizing cases where the routing between
+    experts is too unbalanced.
+
+    Args:
+        router_probs (`torch.Tensor`):
+            Probability assigned to each expert per token. Shape: [batch_size, sequence_length, num_experts].
+        expert_indices (`torch.Tensor`):
+            Indices tensor of shape [batch_size, sequence_length] identifying the selected expert for a given token.
+
+    Returns:
+        The auxiliary loss.
+    """
+    num_experts = router_probs.shape[-1]
+
+    # cast the expert indices to int64, otherwise one-hot encoding will fail
+    if expert_indices.dtype != torch.int64:
+        expert_indices = expert_indices.to(torch.int64)
+
+    if len(expert_indices.shape) == 2:
+        expert_indices = expert_indices.unsqueeze(2)
+
+    expert_mask = torch.nn.functional.one_hot(expert_indices, num_experts)
+
+    # For a given token, determine if it was routed to a given expert.
+    expert_mask = torch.max(expert_mask, axis=-2).values
+
+    # cast to float32 otherwise mean will fail
+    expert_mask = expert_mask.to(torch.float32)
+    tokens_per_group_and_expert = torch.mean(expert_mask, axis=-2)
+
+    router_prob_per_group_and_expert = torch.mean(router_probs, axis=-2)
+    return torch.mean(tokens_per_group_and_expert * router_prob_per_group_and_expert) * (num_experts**2)
+
+
+class GPTSanJapaneseDenseActDense(nn.Module):
+    """
+    FFN Layer for Switch Transformer and Extra layers
+
+    GPTSAN can mix Switch Transformer layers and normal Transformer layers This class is used as Expert in Switch
+    Transformer layers and as FFN in regular Transformer layers. RELU is used in the Switch Transformer layer, and
+    Swish is used in the normal Transformer layer, so there is a choice of which is used in the argument.
+
+    """
+
+    def __init__(self, config: GPTSanJapaneseConfig, ext_layer=False):
+        super().__init__()
+        d_inter = config.d_ext if ext_layer else config.d_ff
+        self.wi = nn.Linear(config.d_model, d_inter, bias=ext_layer)
+        self.wo = nn.Linear(d_inter, config.d_model, bias=ext_layer)
+        self.dropout = nn.Identity() if ext_layer else nn.Dropout(config.dropout_rate)
+        self.act = ACT2FN["swish" if ext_layer else "relu"]
+
+    def forward(self, hidden_states):
+        r"""
+        Args:
+            hidden_states (`torch.Tensor`) :
+                [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+        Returns:
+            torch.Tensor[num_groups, tokens_per_group, hidden_dim]
+
+        """
+        hidden_states = self.wi(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.wo(hidden_states)
+        return hidden_states
+
+
+class GPTSanJapaneseTop1Router(nn.Module):
+    """
+    Router using tokens choose top-1 experts assignment.
+
+    This router uses the same mechanism as in Switch Transformer (https://huggingface.co/papers/2101.03961) and V-MoE
+    (https://huggingface.co/papers/2106.05974): tokens choose their top experts. Items are sorted by router_probs and then
+    routed to their choice of expert until the expert's expert_capacity is reached. **There is no guarantee that each
+    token is processed by an expert**, or that each expert receives at least one token.
+
+    """
+
+    def __init__(self, config: GPTSanJapaneseConfig):
+        super().__init__()
+        self.num_experts = config.num_experts
+        self.expert_capacity = config.expert_capacity
+        self.classifier = nn.Linear(config.hidden_size, self.num_experts, bias=config.router_bias)
+        self.jitter_noise = config.router_jitter_noise
+        self.ignore_padding_tokens = config.router_ignore_padding_tokens
+        self.dtype = getattr(torch, config.router_dtype)
+
+    def _compute_router_probabilities(self, hidden_states: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]:
+        r"""
+        Computes router probabilities from input hidden states.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                (batch_size, sequence_length, hidden_dim) from which router probabilities are computed.
+        Returns:
+            router_probabilities (`torch.Tensor`):
+                Tensor of shape (batch_size, sequence_length, num_experts) corresponding to the probabilities for each
+                token and expert. Used for routing tokens to experts.
+            router_logits (`torch.Tensor`):
+                Logits tensor of shape (batch_size, sequence_length, num_experts) corresponding to raw router logits.
+                This is used later for computing router z-loss.
+        """
+        # float32 is used to ensure stability. See the discussion of "selective precision" in
+        # https://huggingface.co/papers/2101.03961.
+        # We also store the previous dtype to cast back the output to the previous dtype
+        self.input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(self.dtype)
+
+        if self.training and self.jitter_noise > 0:
+            # Multiply the token inputs by the uniform distribution - adding some noise
+            hidden_states *= torch.empty_like(hidden_states).uniform_(1.0 - self.jitter_noise, 1.0 + self.jitter_noise)
+
+        # Shape: [num_groups, tokens_per_group, num_experts]
+        self._cast_classifier()
+        router_logits = self.classifier(hidden_states)
+
+        # Apply Softmax and cast back to the original `dtype`
+        router_probabilities = nn.functional.softmax(router_logits, dim=-1, dtype=self.dtype).to(self.input_dtype)
+        return router_probabilities, router_logits
+
+    def _cast_classifier(self):
+        r"""
+        `bitsandbytes` `Linear8bitLt` layers does not support manual casting Therefore we need to check if they are an
+        instance of the `Linear8bitLt` class by checking special attributes.
+        """
+        if not (hasattr(self.classifier, "SCB") or hasattr(self.classifier, "CB")):
+            self.classifier = self.classifier.to(self.dtype)
+
+    def forward(self, hidden_states: torch.Tensor) -> tuple:
+        r"""
+        Generic forward function for every Router class. Each Router expects to have the same input hidden states
+        (`hidden_states`) corresponding to the hidden states for each token, the `expert_capacity` corresponding to the
+        number of tokens the Router will send to each expert, some Routers can send up to few tokens to each expert.
+
+        Each Router works as the following: it expects the hidden states for each token, gets the `router_probs` and
+        `router_logits` from the `router_weights`. This will assign for each token, the raw probability to be assigned
+        to an expert. Then each Router class will have to define its own `_compute_routing_instructions`.
+
+        Args:
+            hidden_states (`torch.Tensor`) :
+                [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+        Returns:
+            tuple[`torch.Tensor`, `torch.Tensor`, `torch.Tensor`] Tuple containing the expert index, the router probs
+            and the router logits. The router probabilities and logits are required to compute the loss.
+        """
+        router_probs, router_logits = self._compute_router_probabilities(hidden_states)
+
+        expert_index = torch.argmax(router_probs, dim=-1)
+        expert_index = torch.nn.functional.one_hot(expert_index, num_classes=self.num_experts)
+
+        # Mask tokens outside expert capacity. Sum over each sequence
+        token_priority = torch.cumsum(expert_index, dim=-2)
+        # mask if the token routed to to the expert will overflow
+        expert_capacity_mask = token_priority <= self.expert_capacity
+        expert_index = expert_index * expert_capacity_mask
+
+        router_probs = torch.max(router_probs, dim=-1).values.unsqueeze(-1)
+        return expert_index, router_probs, router_logits
+
+
+class GPTSanJapaneseSparseMLP(nn.Module):
+    r"""
+    Implementation of the Switch Transformers Sparse MLP module.
+    """
+
+    def __init__(self, config: GPTSanJapaneseConfig, expert_class: nn.Module = GPTSanJapaneseDenseActDense):
+        super().__init__()
+        # Step 1: Get the correct router according to its class
+        self.router = GPTSanJapaneseTop1Router(config)
+
+        # Step 2: Get the experts
+        self.experts = nn.ModuleDict()
+        for idx in range(config.num_experts):
+            self.experts[f"expert_{idx}"] = expert_class(config)
+
+    def forward(self, hidden_states):
+        r"""
+        Hold on, this will be slightly tricky to understand In the correct order, a MoE layer does the following:
+
+        1- Gets the `router_mask` from the router. The shape of the mask is `(batch_size, sequence_length, num_expert)`
+        and corresponds to the argmax of the `router_probs`. The probabilities are needed in the computation of the
+        hidden states : they are broadcasted to the hidden states values (can be interpreted as a scaling factor).
+
+        2- Dispatch the tokens to its associated experts. We do a classic for loop over the experts and assign for each
+        expert the corresponding hidden states.
+
+        """
+        # Step 1: Get the router_mask from the router as well as the probabilities
+        router_mask, router_probs, router_logits = self.router(hidden_states)
+        expert_index = torch.argmax(router_mask, dim=-1)
+
+        # The routers introduced might not always map all the tokens, to a router, which means that some hidden states
+        # can be unchanged from one layer to another. That is why the hidden states are cloned before updating only the selected ones.
+
+        next_states = hidden_states.clone()
+        for idx, expert in enumerate(self.experts.values()):
+            token_indices = router_mask[:, :, idx].bool()
+            next_states[token_indices] = expert(hidden_states[token_indices]).to(next_states.dtype)
+
+        hidden_states = router_probs * next_states
+        return hidden_states, (router_logits, expert_index)
+
+
+class GPTSanJapaneseLayerSparseFF(nn.Module):
+    r"""
+    Switch Transformers Feed Forward layer module. This is a wrapper around the Mixture of Experts module.
+
+    Parameters:
+        config : ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+    """
+
+    def __init__(self, config: GPTSanJapaneseConfig):
+        super().__init__()
+        self.mlp = GPTSanJapaneseSparseMLP(config)
+        self.soft_bypass_mlp = nn.Linear(config.d_model, config.d_model, bias=False)
+        self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+
+    def forward(self, hidden_states, output_router_logits):
+        r"""
+        Args:
+            hidden_states (`torch.Tensor`) :
+                [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+            output_router_logits (`bool`) :
+                output experts router output.
+        Returns:
+            torch.Tensor[num_groups, tokens_per_group, hidden_dim]
+
+        """
+        forwarded_states, router_tuple = self.mlp(hidden_states)
+        forwarded_states += torch.tanh(self.soft_bypass_mlp(hidden_states))
+        output = hidden_states + self.norm(forwarded_states)
+
+        if output_router_logits and router_tuple is not None:
+            return output, router_tuple
+        else:
+            return output
+
+
+class GPTSanJapaneseLayerDenseFF(nn.Module):
+    r"""
+    Extra Transformers Feed Forward layer module.
+
+    Parameters:
+        config : ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+    """
+
+    def __init__(self, config: GPTSanJapaneseConfig):
+        super().__init__()
+        # Check if it is a sparse layer, if not then it is a dense layer
+        self.mlp = GPTSanJapaneseDenseActDense(config, ext_layer=True)
+        self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+
+    def forward(self, hidden_states):
+        r"""
+        Args:
+            hidden_states (`torch.Tensor`) :
+                [num_groups, tokens_per_group, hidden_dim] inputs to send to experts.
+        Returns:
+            torch.Tensor[num_groups, tokens_per_group, hidden_dim]
+
+        """
+        forwarded_states = self.mlp(hidden_states)
+        output = hidden_states + self.norm(forwarded_states)
+        return output
+
+
+class GPTSanJapaneseAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[GPTSanJapaneseConfig] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_values[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_values` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_values is not None
+            and past_key_values[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_values[0]
+            value_states = past_key_values[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_values is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_values[0], key_states], dim=2)
+            value_states = torch.cat([past_key_values[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_values` is always `None`
+            past_key_values = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_values
+
+
+class GPTSanJapaneseLayerSelfAttention(nn.Module):
+    """
+    Self Attention and Normalization Unit
+    """
+
+    def __init__(self, config, has_relative_attention_bias=False):
+        super().__init__()
+        self.self_attn = GPTSanJapaneseAttention(
+            embed_dim=config.d_model,
+            num_heads=config.num_heads,
+            is_decoder=True,
+            bias=has_relative_attention_bias,
+        )
+        self.norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_epsilon)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+        r"""
+        Self-attention and normalize block.
+
+        Args:
+            hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                if the model is configured as a decoder.
+            past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+                decoding. If `past_key_values` are used, the user can optionally input only the last
+                `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
+                `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+            head_mask (`numpy.ndarray` of shape `({0})`, `optional):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        Returns:
+            tuple[torch.Tensor[num_groups, tokens_per_group, hidden_dim],...]
+        """
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        atten_out = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=self_attn_past_key_value,
+            attention_mask=(1 - attention_mask) * torch.finfo(hidden_states.dtype).min,
+            layer_head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        if output_attentions:
+            attn_weights = (atten_out[1],)
+        else:
+            attn_weights = ()
+
+        attention_output = atten_out[0]
+
+        hidden = hidden_states + self.norm(attention_output)
+
+        if use_cache:
+            outputs = (hidden, atten_out[2])  # hidden, present, (attentions)
+        else:
+            outputs = (hidden,)  # hidden, (attentions)
+
+        return outputs + attn_weights
+
+
+class GPTSanJapaneseBlock(nn.Module):
+    """
+    Self Attention and FFN Unit
+    """
+
+    def __init__(self, config, ext_layer=False):
+        super().__init__()
+        self.self_attn = GPTSanJapaneseLayerSelfAttention(config)
+        self.feed_forward = GPTSanJapaneseLayerDenseFF(config) if ext_layer else GPTSanJapaneseLayerSparseFF(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        output_router_tuple: Optional[bool] = False,
+    ) -> tuple[Union[torch.Tensor, tuple[torch.Tensor]], ...]:
+        r"""
+        GPTSAN transformer block.
+
+        Args:
+            hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                if the model is configured as a decoder.
+            past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up
+                decoding. If `past_key_values` are used, the user can optionally input only the last
+                `decoder_input_ids` (those that don't have their past key value states given to this model) of shape
+                `(batch_size, 1)` instead of all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+            head_mask (`numpy.ndarray` of shape `({0})`, `optional):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            output_attentions (`bool`) :
+                output attention probabirities.
+            output_router_tuple:
+                output experts router logits and expert id.
+        Returns:
+            tuple[torch.Tensor[num_groups, tokens_per_group, hidden_dim],...]
+        """
+        atten_out = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+        )
+        attention_output = atten_out[0]
+
+        if isinstance(self.feed_forward, GPTSanJapaneseLayerSparseFF):
+            sparse_out = self.feed_forward(attention_output, output_router_tuple)
+            if output_router_tuple:
+                hidden, router_tuple = sparse_out
+            else:
+                hidden = sparse_out
+        else:
+            hidden = self.feed_forward(attention_output)
+
+        outputs = (hidden,) + atten_out[1:]
+
+        if isinstance(self.feed_forward, GPTSanJapaneseLayerSparseFF) and output_router_tuple:
+            outputs += (router_tuple,)
+
+        return outputs
+
+
+class GPTSanJapanesePreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: GPTSanJapaneseConfig
+    base_model_prefix = "gptsan_japanese"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["GPTSanJapaneseBlock"]
+    _skip_keys_device_placement = "past_key_values"
+
+    @property
+    def dummy_inputs(self):
+        input_ids = torch.tensor(DUMMY_INPUTS)
+        input_mask = torch.tensor(DUMMY_MASK)
+        dummy_inputs = {
+            "input_ids": input_ids,
+            "attention_mask": input_mask,
+        }
+        return dummy_inputs
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        factor = self.config.initializer_factor  # Used for testing weights initialization
+        if isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(factor * 1.0)
+            module.bias.data.zero_()
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
+            if hasattr(module, "bias") and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=factor * 1.0)
+        elif isinstance(module, GPTSanJapaneseModel):
+            # Mesh TensorFlow embeddings initialization
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            module.embed_tokens.weight.data.normal_(mean=0.0, std=factor * 1.0)
+            module.position_embeddings.weight.data.normal_(mean=0.0, std=factor * 1.0)
+            if hasattr(module, "extra_position_embeddings") and module.extra_position_embeddings is not None:
+                module.extra_position_embeddings.weight.data.normal_(mean=0.0, std=factor * 1.0)
+        elif isinstance(module, (GPTSanJapaneseModel, GPTSanJapaneseForConditionalGeneration)):
+            # Mesh TensorFlow embeddings initialization
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L1624
+            module.final_logits_bias.data.normal_(mean=0.0, std=factor * 1.0)
+            if hasattr(module, "lm_head") and not self.config.tie_word_embeddings:
+                module.lm_head.weight.data.normal_(mean=0.0, std=factor * 1.0)
+        elif isinstance(module, GPTSanJapaneseDenseActDense):
+            # Mesh TensorFlow FF initialization
+            # See https://github.com/tensorflow/mesh/blob/master/mesh_tensorflow/transformer/transformer_layers.py#L56
+            # and https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/layers.py#L89
+            module.wi.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_model) ** -0.5))
+            if hasattr(module.wi, "bias") and module.wi.bias is not None:
+                module.wi.bias.data.zero_()
+            module.wo.weight.data.normal_(mean=0.0, std=factor * ((self.config.d_ff) ** -0.5))
+            if hasattr(module.wo, "bias") and module.wo.bias is not None:
+                module.wo.bias.data.zero_()
+        elif isinstance(module, GPTSanJapaneseAttention):
+            # Multi-headed attention
+            d_model = self.config.d_model
+            key_value_proj_dim = self.config.d_model
+            n_heads = self.config.num_heads
+            module.k_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
+            module.v_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
+            module.q_proj.weight.data.normal_(mean=0.0, std=factor * ((d_model * key_value_proj_dim) ** -0.5))
+            module.out_proj.weight.data.normal_(mean=0.0, std=factor * ((n_heads * key_value_proj_dim) ** -0.5))
+        elif isinstance(module, GPTSanJapaneseSparseMLP):
+            # Mesh TensorFlow attention initialization to avoid scaling before softmax
+            # See https://github.com/tensorflow/mesh/blob/fa19d69eafc9a482aff0b59ddd96b025c0cb207d/mesh_tensorflow/transformer/attention.py#L136
+            d_model = self.config.d_model
+            key_value_proj_dim = self.config.d_model
+            n_heads = self.config.num_heads
+            module.router.classifier.weight.data.normal_(mean=0.0, std=factor * 1)
+            for idx in range(self.config.num_experts):
+                module.experts[f"expert_{idx}"].wi.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
+                module.experts[f"expert_{idx}"].wo.weight.data.normal_(mean=0.0, std=factor * (d_model**-0.5))
+
+    def _shift_right(self, input_ids):
+        decoder_start_token_id = self.config.decoder_start_token_id
+        pad_token_id = self.config.pad_token_id
+
+        if decoder_start_token_id is None:
+            raise ValueError(
+                "self.model.config.decoder_start_token_id has to be defined. In T5 it is usually set to the pad_token_id. "
+                "See T5 docs for more information."
+            )
+
+        # shift inputs to the right
+        if is_torch_fx_proxy(input_ids):
+            # Item assignment is not supported natively for proxies.
+            shifted_input_ids = torch.full(input_ids.shape[:-1] + (1,), decoder_start_token_id)
+            shifted_input_ids = torch.cat([shifted_input_ids, input_ids[..., :-1]], dim=-1)
+        else:
+            shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+            shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+            shifted_input_ids[..., 0] = decoder_start_token_id
+
+        if pad_token_id is None:
+            raise ValueError("self.model.config.pad_token_id has to be defined.")
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+        return shifted_input_ids
+
+
+GPTSAN_JAPANESE_START_DOCSTRING = r"""
+
+    The [GPTSAN-japanese](https://github.com/tanreinama/GPTSAN) model was proposed in General-purpose Swich transformer
+    based Japanese language model
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`GPTSanJapaneseConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+GPTSAN_JAPANESE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. GPTSAN-japanese is a model that generates sentence
+            continuations or predicts tokens at mask positions. Special tokens required for inputs to the model are
+            automatically appended.
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            An input that masks the Prefix part in the Prefix-LM input. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **prefix** input,
+            - 0 for tokens that are **not-prefix** input.
+        spout (`torch.Tensor` of shape `(batch_size, config.d_spout)`):
+                This vector is transformed through an 8-layer FFN and can be used instead of `past_key_values`.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, target_sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `decoder_input_ids` you can choose to directly pass an embedded
+            representation. If `past_key_values` is used, optionally only the last `decoder_inputs_embeds` have to be
+            input (see `past_key_values`). This is useful if you want more control over how to convert
+            `decoder_input_ids` indices into associated vectors than the model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        router_logits (`tuple(torch.FloatTensor)`, *optional*, returned when `output_router_logits=True` is passed or when `config.add_router_probs=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, sequence_length, num_experts)`.
+            Router logits of the decoder model, useful to compute the auxiliary loss for Mixture of Experts models.
+"""
+
+
+@add_start_docstrings(
+    "The bare GPTSAN-japanese Model transformer outputting raw hidden-states without any specific head on top.",
+    GPTSAN_JAPANESE_START_DOCSTRING,
+)
+class GPTSanJapaneseModel(GPTSanJapanesePreTrainedModel):
+    def __init__(self, config: GPTSanJapaneseConfig):
+        super().__init__(config)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model)
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model)
+        self.last_project = nn.Linear(config.d_model, config.d_model, bias=True)
+        self.act = ACT2FN["swish"]
+
+        self.blocks = torch.nn.ModuleList([])
+        for _ in range(config.num_switch_layers):
+            self.blocks.append(GPTSanJapaneseBlock(config))
+        for _ in range(config.num_ext_layers):
+            self.blocks.append(GPTSanJapaneseBlock(config, ext_layer=True))
+
+        if config.num_ext_layers > 0:
+            self.extra_position_embeddings = nn.Embedding(config.max_position_embeddings, config.d_model)
+
+        if config.d_spout:
+            spouts = []
+            for _ in range(8):
+                spouts.append(nn.Linear(config.d_spout, config.d_spout, bias=False))
+                spouts.append(nn.Tanh())
+            spouts.append(nn.Linear(config.d_spout, config.num_layers * 2 * config.d_model, bias=False))
+            self.spout = nn.Sequential(*spouts)
+
+        self.post_init()
+
+    def set_input_embeddings(self, new_embeddings):
+        self.embed_tokens = new_embeddings
+
+    @add_start_docstrings_to_model_forward(GPTSAN_JAPANESE_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.FloatTensor] = None,
+        spout: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        num_precontext: Optional[torch.LongTensor] = None,
+    ) -> Union[MoEModelOutputWithPastAndCrossAttentions, tuple[torch.FloatTensor]]:
+        r"""
+        num_precontext (`torch.LongTensor` of shape `(batch_size,1)`):
+            length of `hybrid` input tokens in the input. Tokens up to this length refer to both front and back like
+            BERT, tokens after that refer only to front like GPT. see also:
+            https://github.com/tanreinama/GPTSAN/blob/main/report/model.md
+
+        Returns:
+            `MoEModelOutputWithPastAndCrossAttentions` or `tuple` if `return_dict` returns
+            MoEModelOutputWithPastAndCrossAttentions instead of tuple
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        device = self.position_embeddings.weight.device
+        if input_ids is None:
+            input_ids = torch.zeros([1, 1]).int().to(device)  # dummy for input_ids was None
+        if inputs_embeds is not None:
+            raise NotImplementedError(
+                "GPTSanJapaneseModel does not use `inputs_embeds`. Make sure to pass in `input_ids` instead."
+            )
+        num_pasts_contexts = 0
+        num_batch = input_ids.shape[0]
+        pasts_or_spout_value = None
+        if past_key_values is not None:
+            num_pasts_contexts = past_key_values.get_seq_length()
+        elif self.config.d_spout and spout is not None:
+            # `spout` is a special input vector specific to GPTSAN
+            # This controls the output by projecting embedded information such as the class of sentences during learning.
+            # It should passed instead of the first past_key_values.
+            # See the original GPTSAN repository for details
+            num_pasts_contexts += 1
+
+        # If there is an attention_mask, increase first one for spout
+        if self.config.d_spout and spout is not None and attention_mask is not None:
+            attention_mask_with_spout = torch.ones(num_batch, attention_mask.shape[1] + 1, device=device)
+            attention_mask_with_spout[:, 1:] -= 1 - attention_mask  # 1st token should be spout
+            attention_mask = attention_mask_with_spout  # update attention_mask
+
+        if num_precontext is not None:
+            # `num_precontext` is the number of tokens that refer to each other in prefix-lm
+            # created per batch, so dimension of num_precontext should be [batch, 1]
+            if not (
+                len(num_precontext.shape) == 2 and num_precontext.shape[1] == 1
+            ):  # num_precontext Should be [batch,1]
+                raise ValueError("num_precontext should be [batch, 1] size.")
+            num_precontext = torch.reshape(num_precontext, [-1])
+        else:
+            num_precontext = torch.zeros([num_batch]).int().to(device)
+
+        num_input_contexts = input_ids.shape[1]
+        num_output_contexts = num_input_contexts + num_pasts_contexts
+
+        hidden_states = self.embed_tokens(input_ids)
+
+        if past_key_values is not None:
+            pasts_or_spout_value = past_key_values
+        elif self.config.d_spout and spout is not None:
+            # Make vector from `spout` of GPTSAN to the same shape as past_key_values
+            pasts_or_spout_value = self.spout(spout)  # projecting `spout` vector
+            pasts_or_spout_value = torch.reshape(
+                pasts_or_spout_value,
+                [
+                    num_batch,
+                    self.config.num_layers,
+                    2,
+                    self.config.num_heads,
+                    num_pasts_contexts,
+                    self.config.d_model // self.config.num_heads,
+                ],
+            )
+            pasts_or_spout_value = torch.split(pasts_or_spout_value, [1] * self.config.num_layers, dim=1)
+            # make same shape as past_key_values
+            pasts_or_spout_value = tuple(
+                tuple(b.squeeze(1) for b in torch.split(a.squeeze(1), [1, 1], dim=1)) for a in pasts_or_spout_value
+            )
+        else:
+            pasts_or_spout_value = [None] * self.config.num_layers
+
+        # Token position considering spout and pasts
+        token_position = torch.arange(num_input_contexts).to(device) + num_pasts_contexts
+
+        if attention_mask is None:
+            attention_mask = torch.ones(num_batch, num_input_contexts, device=device)
+
+        # positions for get position_embeddings
+        gather_position = (
+            (
+                torch.zeros((num_batch, self.config.d_model, num_input_contexts)).to(device)
+                + token_position.unsqueeze(0)
+            )
+            .transpose(1, 2)
+            .long()
+        )
+        # When padding with padding_side="left", zeros line up on the left side of attention_mask, so position_embeddings is shifted accordingly
+        gather_position -= (1 - attention_mask).argmin(dim=-1).unsqueeze(1).unsqueeze(2)
+        gather_position = torch.clip(gather_position, num_pasts_contexts, self.config.max_position_embeddings - 1)
+
+        # attention_mask is applied per batch
+        for i in range(num_batch):
+            hidden_states[i] += torch.gather(self.position_embeddings.weight, dim=0, index=gather_position[i])
+
+        # Create a mask to be used when making the prefix Input length of Prefix-LM variable
+        causal_mask = (
+            torch.tril(torch.ones((num_output_contexts, num_output_contexts), dtype=torch.uint8))
+            .view(1, 1, num_output_contexts, num_output_contexts)
+            .to(device)
+        )
+        prefix_lm_mask = causal_mask[:, :, -num_input_contexts:, :]
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.unsqueeze(1).unsqueeze(2)
+            prefix_lm_mask = ((prefix_lm_mask + token_type_ids) > 0).float()
+        # Merge prefix_lm_mask and attention_mask
+        extended_attention_mask = prefix_lm_mask * attention_mask.unsqueeze(1).unsqueeze(2)
+
+        # Prepare head mask if needed
+        if head_mask is not None:
+            head_mask = self.get_head_mask(
+                head_mask, self.config.num_switch_layers + self.config.num_ext_layers
+            )  # n_layer x batch x n_heads x N x N
+
+        # outputs
+        present_key_value_states = () if self.config.use_cache or use_cache else None
+        all_hidden_states = () if self.config.output_hidden_states or output_hidden_states else None
+        all_attentions = () if self.config.output_attentions or output_attentions else None
+        all_router_probs = () if self.config.output_router_logits or output_router_logits else None
+
+        for layer, past in enumerate(pasts_or_spout_value):
+            if layer == self.config.num_switch_layers:
+                if self.config.num_ext_layers > 0:
+                    # extra_position_embeddings are extra position embeddings that are only created when extending the model with code from the original GPTSAN repository. Not used in the default model.
+                    # However, it is created when you create an additional layer and partially train only that location.
+                    # Therefore, convert_gptsan_tf_checkpoint_to_pytorch.py is used when converting and loading models created in the original GPTSAN repository.
+                    for i in range(num_batch):
+                        hidden_states[i] += torch.gather(
+                            self.extra_position_embeddings.weight, dim=0, index=gather_position[i]
+                        )
+
+            output_router_tuple = (
+                self.config.output_router_logits or output_router_logits
+            ) and layer < self.config.num_switch_layers
+            block_output = self.blocks[layer](
+                hidden_states=hidden_states,
+                past_key_values=past,
+                attention_mask=extended_attention_mask,
+                head_mask=head_mask,
+                use_cache=self.config.use_cache or use_cache,
+                output_attentions=self.config.output_attentions or output_attentions,
+                output_router_tuple=output_router_tuple,
+            )
+
+            outpos = 0
+            hidden_states = block_output[outpos]
+            if self.config.output_hidden_states or output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.config.use_cache or use_cache:
+                outpos += 1
+                present = block_output[outpos]
+                present_key_value_states += (present,)
+            if self.config.output_attentions or output_attentions:
+                outpos += 1
+                attention_probs = block_output[outpos]
+                all_attentions += (attention_probs,)
+            if output_router_tuple:
+                outpos += 1
+                router_tuple = block_output[outpos]
+                all_router_probs.append(router_tuple[0])
+
+        hidden_states = self.last_project(hidden_states)
+        hidden_states = self.act(hidden_states)
+
+        if self.config.output_hidden_states or output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    present_key_value_states,
+                    all_hidden_states,
+                    all_attentions,
+                    all_router_probs,
+                ]
+                if v is not None
+            )
+
+        return MoEModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=present_key_value_states,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            router_probs=all_router_probs,
+        )
+
+
+@add_start_docstrings(
+    "The bare GPTSAN-japanese Model with a language modeling head.",
+    GPTSAN_JAPANESE_START_DOCSTRING,
+)
+class GPTSanJapaneseForConditionalGeneration(GPTSanJapanesePreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: GPTSanJapaneseConfig):
+        super().__init__(config)
+        self.model = GPTSanJapaneseModel(config)
+        self.register_buffer("final_logits_bias", torch.zeros([1, config.vocab_size]))
+        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)
+        if not self.config.torchscript:
+            self.lm_head.weight = self.model.embed_tokens.weight
+
+    @add_start_docstrings_to_model_forward(GPTSAN_JAPANESE_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.FloatTensor] = None,
+        spout: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = False,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        output_router_logits: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.FloatTensor], MoECausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
+            labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+            `MoECausalLMOutputWithPast` or `tuple` if `return_dict` returns MoECausalLMOutputWithPast instead of tuple
+
+        Example:
+
+        Text Generation with regular LM Model
+        ```python
+        >>> from transformers import AutoModel, AutoTokenizer, trainer_utils
+
+        >>> device = "cuda"
+        >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
+        >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+        >>> x_token = tokenizer("織田信長は、", return_tensors="pt")
+        >>> trainer_utils.set_seed(30)
+        >>> input_ids = x_token.input_ids.to(device)
+        >>> gen_token = model.generate(input_ids, max_new_tokens=50)
+        >>> tokenizer.decode(gen_token[0])
+        "織田信長は、政治・軍事の中枢まで掌握した政治家であり、日本史上類を見ない驚異的な軍事侵攻を続け..."
+        ```
+
+        Text Generation with Prefix-LM Model
+        ```python
+        >>> from transformers import AutoModel, AutoTokenizer, trainer_utils
+
+        >>> device = "cuda"
+        >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
+        >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+        >>> x_token = tokenizer("", prefix_text="織田信長は、", return_tensors="pt")
+        >>> trainer_utils.set_seed(30)
+        >>> input_ids = x_token.input_ids.to(device)
+        >>> token_type_ids = x_token.token_type_ids.to(device)
+        >>> gen_token = model.generate(input_ids, token_type_ids=token_type_ids, max_new_tokens=50)
+        >>> tokenizer.decode(gen_token[0])
+        "織田信長は、政治・外交で数々の戦果を上げるが、1568年からは、いわゆる本能寺の変で細川晴元に暗殺される..."
+        ```
+
+        Simultaneously Text Generation And Masked Language Model
+        ```python
+        >>> from transformers import AutoModel, AutoTokenizer, trainer_utils
+
+        >>> device = "cuda"
+        >>> model = AutoModel.from_pretrained("Tanrei/GPTSAN-japanese").to(device)
+        >>> tokenizer = AutoTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+        >>> masked_sentence = "武田信玄は、<|inputmask|>時代ファンならぜひ押さえ<|inputmask|>きたい名将の一人。"
+        >>> x_token = tokenizer("", prefix_text=masked_sentence, return_tensors="pt")
+        >>> trainer_utils.set_seed(30)
+        >>> input_ids = x_token.input_ids.to(device)
+        >>> token_type_ids = x_token.token_type_ids.to(device)
+        >>> out_lm_token = model.generate(input_ids, token_type_ids=token_type_ids, max_new_tokens=50)
+        >>> out_mlm_token = model(input_ids, token_type_ids=token_type_ids).logits.argmax(axis=-1)
+        >>> tokenizer.decode(out_mlm_token[0])
+        "武田信玄は、戦国時代ファンならぜひ押さえておきたい名将の一人。"
+
+        >>> tokenizer.decode(out_lm_token[0][input_ids.shape[1] :])
+        "武田氏の三代に渡った武田家のひとり\n甲斐市に住む、日本史上最大の戦国大名。..."
+        ```"""
+        SEG_TOKEN = self.config.separator_token_id
+        use_cache = use_cache or self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        model_return_dict = True
+        num_precontext = None
+        if input_ids is not None:
+            num_batch = input_ids.shape[0]
+            num_precontext = torch.zeros([num_batch]).int().to(input_ids.device)
+            where_separators = torch.where(input_ids == SEG_TOKEN)
+            num_precontext[where_separators[0]] += where_separators[1]
+            num_precontext = num_precontext.unsqueeze(1)
+
+        outputs = self.model(
+            input_ids,
+            attention_mask,
+            token_type_ids,
+            spout,
+            past_key_values,
+            head_mask,
+            use_cache,
+            inputs_embeds,
+            decoder_inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            model_return_dict,
+            output_router_logits,
+            num_precontext,
+        )
+
+        lm_logits = self.lm_head(outputs[0])
+        if lm_logits.shape[-1] == self.final_logits_bias.shape[-1]:
+            lm_logits = lm_logits + self.final_logits_bias
+
+        loss = None
+        z_loss = None
+        router_probs = None
+        aux_loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(lm_logits.device)
+
+            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)
+
+            if output_router_logits:
+                # Compute the router loss (z_loss + auxiliary loss) for each router in the encoder and decoder
+                router_logits, expert_indexes = self._unpack_router_logits(outputs.router_probs)
+                z_loss = router_z_loss_func(router_logits)
+                router_probs = nn.Softmax(dim=-1)(router_logits)
+                aux_loss = load_balancing_loss_func(router_probs, expert_indexes)
+
+            loss = loss_fct(lm_logits.view(-1, lm_logits.size(-1)), labels.view(-1))
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    loss,
+                    lm_logits,
+                    outputs.past_key_values,
+                    outputs.hidden_states,
+                    outputs.router_probs,
+                    z_loss,
+                    aux_loss,
+                ]
+                if v is not None
+            )
+
+        return MoECausalLMOutputWithPast(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            router_logits=outputs.router_probs,
+            z_loss=z_loss,
+            aux_loss=aux_loss,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: torch.LongTensor,
+        attention_mask: torch.FloatTensor,
+        token_type_ids: Optional[torch.FloatTensor] = None,
+        spout: Optional[Union[list, torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        **kwargs,
+    ):
+        if isinstance(spout, list):
+            spout = torch.tensor(spout).float()
+            if input_ids is not None:
+                spout = spout.to(input_ids.device)
+        if past_key_values is not None:
+            return {
+                "input_ids": input_ids[:, -1:] if input_ids is not None else None,
+                "attention_mask": attention_mask,
+                "token_type_ids": token_type_ids[:, -1:] if token_type_ids is not None else None,
+                "spout": spout,
+                "past_key_values": past_key_values,
+            }
+        return {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "token_type_ids": token_type_ids,
+            "spout": spout,
+            "past_key_values": None,
+        }
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return self._shift_right(labels)
+
+    def resize_token_embeddings(
+        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
+    ) -> nn.Embedding:
+        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
+        return new_embeddings
+
+    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
+        old_num_tokens = self.final_logits_bias.shape[-1]
+        if new_num_tokens <= old_num_tokens:
+            new_bias = self.final_logits_bias[:, :new_num_tokens]
+        else:
+            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
+            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
+        self.register_buffer("final_logits_bias", new_bias)
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, new_embeddings):
+        self.model.set_input_embeddings(new_embeddings)
+
+    def _unpack_router_logits(self, router_outputs):
+        total_router_logits = []
+        total_expert_indexes = []
+        for router_output in router_outputs:
+            if len(router_output[0].shape) > 1:
+                router_logits, expert_indexes = router_output
+                total_router_logits.append(router_logits)
+                total_expert_indexes.append(expert_indexes)
+        return torch.cat(total_router_logits, dim=1), torch.cat(total_expert_indexes, dim=1)
+
+
+__all__ = ["GPTSanJapaneseForConditionalGeneration", "GPTSanJapaneseModel", "GPTSanJapanesePreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
new file mode 100644
index 0000000000000000000000000000000000000000..1025fdf75fb46a0d350359a66c09315bd42effd3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/gptsan_japanese/tokenization_gptsan_japanese.py
@@ -0,0 +1,518 @@
+# coding=utf-8
+# Copyright 2023 HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for GPTSANJapanese."""
+
+import collections
+import json
+import os
+import re
+import sys
+from typing import Optional, Union
+
+import numpy as np
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....tokenization_utils_base import (
+    BatchEncoding,
+    PreTokenizedInput,
+    PreTokenizedInputPair,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ....utils import PaddingStrategy, logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "emoji_file": "emoji.json"}
+
+
+def load_vocab_and_emoji(vocab_file, emoji_file):
+    """Loads a vocabulary file and emoji file into a dictionary."""
+    with open(emoji_file, "r", encoding="utf-8") as f:
+        emoji = json.loads(f.read())
+
+    vocab = collections.OrderedDict()
+    raw_vocab = collections.OrderedDict()
+    ids_to_tokens = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as f:
+        token = f.readlines()
+    token = [[t.rstrip("\n")] if (t == ",\n" or "," not in t) else t.rstrip("\n").split(",") for t in token]
+    for idx, b in enumerate(token):
+        ids_to_tokens[idx] = b
+        raw_vocab[",".join(b)] = idx
+        for wd in b:
+            vocab[wd] = idx
+
+    return vocab, raw_vocab, ids_to_tokens, emoji
+
+
+class GPTSanJapaneseTokenizer(PreTrainedTokenizer):
+    """
+    This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications
+    - Decoding byte0~byte255 tokens correctly
+    - Added bagofword token handling
+    - Return token_type_ids for Prefix-LM model
+    The bagofword token represents a repetition of the previous token and is converted to 3 consecutive tokens when
+    decoding In addition, the original Japanese special Sub-Word-Encoding has been released in this repository
+    (https://github.com/tanreinama/Japanese-BPEEncoder_V2). The token_type_ids is a mask indicating the prefix input
+    position of the Prefix-LM model. To specify a prefix position, specify a prefix input for prefix_text, or specify a
+    sentence of the prefix part and the part after it as a text pair of batch input.
+
+    Example:
+
+    ```python
+    >>> from transformers import GPTSanJapaneseTokenizer
+
+    >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+    >>> # You can confirm both 慶応 and 慶應 are encoded to 17750
+    >>> tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"]
+    [35993, 35998, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
+
+    >>> # Both 慶応 and 慶應 are decoded to 慶応
+    >>> tokenizer.decode(tokenizer("吾輩は猫である🐯。実は慶応(慶應)大学出身")["input_ids"])
+    '吾輩は猫である🐯。実は慶応(慶応)大学出身'
+    ```
+
+    Example for Prefix-LM:
+
+    ```python
+    >>> from transformers import GPTSanJapaneseTokenizer
+
+    >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+    >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["input_ids"]
+    [35993, 34347, 31459, 30647, 31448, 25, 30659, 35729, 35676, 35998, 32417, 30647, 17750, 35589, 17750, 35590, 321, 1281]
+
+    >>> # Mask for Prefix-LM inputs
+    >>> tokenizer("実は慶応(慶應)大学出身", prefix_text="吾輩は猫である🐯。")["token_type_ids"]
+    [1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0]
+    ```
+
+    Example for batch encode:
+
+    ```python
+    >>> from transformers import GPTSanJapaneseTokenizer
+
+    >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+    >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["input_ids"]
+    [[35993, 35998, 8640, 25948, 35993, 35998, 30647, 35675, 35999, 35999], [35993, 35998, 10382, 9868, 35993, 35998, 30646, 9459, 30646, 35675]]
+
+    >>> # Mask for Prefix-LM inputs
+    >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["token_type_ids"]
+    [[1, 0, 0, 0, 0, 0, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0, 0, 0, 0, 0]]
+
+    >>> # Mask for padding
+    >>> tokenizer([["武田信玄", "は、"], ["織田信長", "の配下の、"]], padding=True)["attention_mask"]
+    [[1, 1, 1, 1, 1, 1, 1, 1, 0, 0], [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]
+    ```
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        emoji_file (`str`):
+            File containing the emoji.
+        unk_token (`str`, *optional*, defaults to `"<|nottoken|>"`):
+            The token used for unknown character
+        pad_token (`str`, *optional*, defaults to `"<|separator|>"`):
+            The token used for padding
+        bos_token (`str`, *optional*, defaults to `"<|startoftext|>"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The end of sequence token.
+        sep_token (`str`, *optional*, defaults to `"<|segmenter|>"`):
+            A special token to separate token to prefix part and general input part.
+        do_clean_text (`bool`, *optional*, defaults to `False`):
+            Whether or not to clean text for URL, EMAIL, TEL, Japanese DATE and Japanese PRICE.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask", "token_type_ids"]
+
+    def __init__(
+        self,
+        vocab_file,
+        emoji_file,
+        unk_token="<|nottoken|>",
+        pad_token="<|separator|>",
+        bos_token="<|startoftext|>",
+        eos_token="<|endoftext|>",
+        sep_token="<|segmenter|>",
+        do_clean_text=False,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        if not os.path.isfile(emoji_file):
+            raise ValueError(
+                f"Can't find a emoji file at path '{emoji_file}'. To load the emoji information from a Google"
+                " pretrained model use `tokenizer = GPTSanJapaneseTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.do_clean_text = do_clean_text
+        self.vocab, self.raw_vocab, self.ids_to_tokens, self.emoji = load_vocab_and_emoji(vocab_file, emoji_file)
+        self.subword_tokenizer = SubWordJapaneseTokenizer(
+            vocab=self.vocab, ids_to_tokens=self.ids_to_tokens, emoji=self.emoji
+        )
+
+        super().__init__(
+            unk_token=unk_token,
+            pad_token=pad_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            do_clean_text=do_clean_text,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        # self.vocab contains support for character fluctuation unique to Japanese, and has a large number of vocab
+        return len(self.raw_vocab)
+
+    def get_vocab(self):
+        return dict(self.raw_vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text):
+        return self.subword_tokenizer.tokenize(text, clean=self.do_clean_text)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.subword_tokenizer.convert_id_to_token(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        words = []
+        byte_tokens = []
+        for word in tokens:
+            if word[:6] == "<|byte" and word[-2:] == "|>":
+                byte_tokens.append(int(word[6:-2]))
+            else:
+                if len(byte_tokens) > 0:
+                    words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+                    byte_tokens = []
+                if word[:7] == "<|emoji" and word[-2:] == "|>":
+                    words.append(self.emoji["emoji_inv"][word])
+                elif word == "<SP>":
+                    words.append(" ")
+                elif word == "<BR>":
+                    words.append("\n")
+                elif word == "<TAB>":
+                    words.append("\t")
+                elif word == "<BLOCK>":
+                    words.append("▀")
+                elif word == "<KIGOU>":
+                    words.append("ǀ")
+                elif word == "<U2000U2BFF>":
+                    words.append("‖")
+                elif word == "<|bagoftoken|>":
+                    if len(words) > 0:
+                        words.append(words[-1])
+                        words.append(words[-1])
+                        words.append(words[-1])
+                elif word.startswith("<|") and word.endswith("|>"):
+                    words.append("")
+                else:
+                    words.append(word)
+        if len(byte_tokens) > 0:
+            words.append(bytearray(byte_tokens).decode("utf-8", errors="replace"))
+        text = "".join(words)
+        return text
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+            emoji_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["emoji_file"]
+            )
+        else:
+            vocab_file = (
+                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["vocab_file"]
+            )
+            emoji_file = (
+                (filename_prefix + "-" if filename_prefix else "") + save_directory + VOCAB_FILES_NAMES["emoji_file"]
+            )
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token_index, token in self.ids_to_tokens.items():
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(",".join(token) + "\n")
+                index += 1
+        with open(emoji_file, "w", encoding="utf-8") as writer:
+            json.dump(self.emoji, writer)
+        return vocab_file, emoji_file
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        # docstyle-ignore
+        """
+        The tokenizer returns token_type_ids as separators between the Prefix part and the rest.
+        token_type_ids is 1 for the Prefix part and 0 for the rest of the token.
+
+        Example:
+        ```python
+        >>> from transformers import GPTSanJapaneseTokenizer
+
+        >>> tokenizer = GPTSanJapaneseTokenizer.from_pretrained("Tanrei/GPTSAN-japanese")
+        >>> x_token = tokenizer("ｱｲｳｴ")
+        >>> # input_ids:      | SOT | SEG | ｱ | ｲ | ｳ | ｴ |
+        >>> # token_type_ids: | 1   | 0   | 0 | 0 | 0 | 0 |
+
+        >>> x_token = tokenizer("", prefix_text="ｱｲｳｴ")
+        >>> # input_ids:      | SOT | ｱ | ｲ | ｳ | ｴ | SEG |
+        >>> # token_type_ids: | 1   | 1 | 1 | 1 | 1 | 0  |
+
+        >>> x_token = tokenizer("ｳｴ", prefix_text="ｱｲ")
+        >>> # input_ids:      | SOT | ｱ | ｲ | SEG | ｳ | ｴ |
+        >>> # token_type_ids: | 1   | 1 | 1 | 0   | 0 | 0 |
+        ```"""
+        prefix_len = 0
+        if self.sep_token in self.vocab:
+            segid = self.vocab[self.sep_token]
+            if segid in token_ids_0:
+                prefix_len = token_ids_0.index(segid)
+        if token_ids_1 is None:
+            total_len = len(token_ids_0)
+        else:
+            total_len = len(token_ids_0 + token_ids_1)
+        return prefix_len * [1] + (total_len - prefix_len) * [0]
+
+    def prepare_for_tokenization(self, text, prefix_text=None, add_sep_token=None, **kwargs):
+        # GPTSAN inserts extra SEP tokens in Prefix-LM in addition to SOT for text generation.
+        # SOT at the beginning of the text, and SEP at the separator between the Prefix part and the rest.
+        if add_sep_token is None:
+            add_sep_token = self.sep_token not in text  # If insert un-prefix position explicitly
+        prepared = self.bos_token if self.bos_token in self.vocab else ""
+        prepared += prefix_text if prefix_text is not None else ""
+        if add_sep_token:
+            prepared += self.sep_token if self.sep_token in self.vocab else ""
+        prepared += text
+        return (prepared, kwargs)
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput], list[TextInputPair], list[PreTokenizedInput], list[PreTokenizedInputPair]
+        ],
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        is_split_into_words: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # This tokenizer converts input text pairs into Prefix input and subsequent input
+        if isinstance(batch_text_or_text_pairs[0], tuple) or isinstance(tuple(batch_text_or_text_pairs[0]), list):
+            # As a single text with an explicit un-prefix position
+            batch_prefix_texts = []
+            for pref, txt in batch_text_or_text_pairs:
+                batch_prefix_texts.append(pref + self.sep_token + txt)
+            batch_text_or_text_pairs = batch_prefix_texts
+
+        return super()._batch_encode_plus(
+            batch_text_or_text_pairs,
+            add_special_tokens,
+            padding_strategy,
+            truncation_strategy,
+            max_length,
+            stride,
+            is_split_into_words,
+            pad_to_multiple_of,
+            return_tensors,
+            return_token_type_ids,
+            return_attention_mask,
+            return_overflowing_tokens,
+            return_special_tokens_mask,
+            return_offsets_mapping,
+            return_length,
+            verbose,
+            **kwargs,
+        )
+
+
+class SubWordJapaneseTokenizer:
+    """
+    This tokenizer is based on GPTNeoXJapaneseTokenizer and has the following modifications
+    - Decoding byte0~byte255 tokens correctly
+    - Added bagofword token handling
+
+    https://github.com/tanreinama/Japanese-BPEEncoder_V2 This tokenizer class is under MIT License according to the
+    original repository.
+
+    MIT License
+
+    Copyright (c) 2020 tanreinama
+
+    Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated
+    documentation files (the "Software"), to deal in the Software without restriction, including without limitation the
+    rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to
+    permit persons to whom the Software is furnished to do so, subject to the following conditions:
+
+    The above copyright notice and this permission notice shall be included in all copies or substantial portions of
+    the Software.
+
+    THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO
+    THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+    AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
+    TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+    SOFTWARE.
+    """
+
+    def __init__(self, vocab, ids_to_tokens, emoji):
+        self.vocab = vocab  # same as swe
+        self.ids_to_tokens = ids_to_tokens  # same as bpe
+        self.emoji = emoji
+        self.maxlen = np.max([len(w) for w in self.vocab])
+        self.content_repatter1 = re.compile(r"(https?|ftp)(:\/\/[-_\.!~*\'()a-zA-Z0-9;\/?:\@&=\+$,%#]+)")
+        self.content_repatter2 = re.compile(r"[A-Za-z0-9\._+]*@[\-_0-9A-Za-z]+(\.[A-Za-z]+)*")
+        self.content_repatter3 = re.compile(r"[\(]{0,1}[0-9]{2,4}[\)\-\(]{0,1}[0-9]{2,4}[\)\-]{0,1}[0-9]{3,4}")
+        self.content_repatter4 = re.compile(
+            r"([12]\d{3}[/\-年])*(0?[1-9]|1[0-2])[/\-月]((0?[1-9]|[12][0-9]|3[01])日?)*(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+        )
+        self.content_repatter5 = re.compile(
+            r"(明治|大正|昭和|平成|令和|㍾|㍽|㍼|㍻|\u32ff)\d{1,2}年(0?[1-9]|1[0-2])月(0?[1-9]|[12][0-9]|3[01])日(\d{1,2}|:|\d{1,2}時|\d{1,2}分|\(日\)|\(月\)|\(火\)|\(水\)|\(木\)|\(金\)|\(土\)|㈰|㈪|㈫|㈬|㈭|㈮|㈯)*"
+        )
+        # The original version of this regex displays catastrophic backtracking behaviour. We avoid this using
+        # possessive quantifiers in Py >= 3.11. In versions below this, we avoid the vulnerability using a slightly
+        # different regex that should generally have the same behaviour in most non-pathological cases.
+        if sys.version_info >= (3, 11):
+            self.content_repatter6 = re.compile(
+                r"(?:\d,\d{3}|[\d億])*+"
+                r"(?:\d,\d{3}|[\d万])*+"
+                r"(?:\d,\d{3}|[\d千])*+"
+                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+                r"(?:\(税込\)|\(税抜\)|\+tax)*"
+            )
+        else:
+            self.content_repatter6 = re.compile(
+                r"(?:\d,\d{3}|[\d億万千])*"
+                r"(?:千円|万円|千万円|円|千ドル|万ドル|千万ドル|ドル|千ユーロ|万ユーロ|千万ユーロ|ユーロ)+"
+                r"(?:\(税込\)|\(税抜\)|\+tax)*"
+            )
+        keisen = "─━│┃┄┅┆┇┈┉┊┋┌┍┎┏┐┑┒┓└┕┖┗┘┙┚┛├┝┞┟┠┡┢┣┤┥┦┧┨┩┪┫┬┭┮┯┰┱┲┳┴┵┶┷┸┹┺┻┼┽┾┿╀╁╂╃╄╅╆╇╈╉╊╋╌╍╎╏═║╒╓╔╕╖╗╘╙╚╛╜╝╞╟╠╡╢╣╤╥╦╧╨╩╪╫╬╭╮╯╰╱╲╳╴╵╶╷╸╹╺╻╼╽╾╿"
+        blocks = "▀▁▂▃▄▅▆▇█▉▊▋▌▍▎▏▐░▒▓▔▕▖▗▘▙▚▛▜▝▞▟"
+        self.content_trans1 = str.maketrans(dict.fromkeys(keisen + blocks, "<BLOCK>"))
+
+    def __len__(self):
+        return len(self.ids_to_tokens)
+
+    def clean_text(self, content):
+        content = self.content_repatter1.sub("<URL>", content)
+        content = self.content_repatter2.sub("<EMAIL>", content)
+        content = self.content_repatter3.sub("<TEL>", content)
+        content = self.content_repatter4.sub("<DATE>", content)
+        content = self.content_repatter5.sub("<DATE>", content)
+        content = self.content_repatter6.sub("<PRICE>", content)
+        content = content.translate(self.content_trans1)
+        while "<BLOCK><BLOCK>" in content:
+            content = content.replace("<BLOCK><BLOCK>", "<BLOCK>")
+        return content
+
+    def tokenize(self, text, clean=False):
+        text = text.replace(" ", "<SP>")
+        text = text.replace("　", "<SP>")
+        text = text.replace("\r\n", "<BR>")
+        text = text.replace("\n", "<BR>")
+        text = text.replace("\r", "<BR>")
+        text = text.replace("\t", "<TAB>")
+        text = text.replace("—", "ー")
+        text = text.replace("−", "ー")
+        for k, v in self.emoji["emoji"].items():
+            if k in text:
+                text = text.replace(k, v)
+        if clean:
+            text = self.clean_text(text)
+
+        def check_simbol(x):
+            e = x.encode()
+            if len(x) == 1 and len(e) == 2:
+                c = (int(e[0]) << 8) + int(e[1])
+                if (
+                    (c >= 0xC2A1 and c <= 0xC2BF)
+                    or (c >= 0xC780 and c <= 0xC783)
+                    or (c >= 0xCAB9 and c <= 0xCBBF)
+                    or (c >= 0xCC80 and c <= 0xCDA2)
+                ):
+                    return True
+            return False
+
+        def checku2e(x):
+            e = x.encode()
+            if len(x) == 1 and len(e) == 3:
+                c = (int(e[0]) << 16) + (int(e[1]) << 8) + int(e[2])
+                if c >= 0xE28080 and c <= 0xE2B07F:
+                    return True
+            return False
+
+        pos = 0
+        result = []
+        while pos < len(text):
+            end = min(len(text), pos + self.maxlen + 1) if text[pos] == "<" else pos + 3
+            candidates = []  # (token_id, token, pos)
+            for e in range(end, pos, -1):
+                wd = text[pos:e]
+                if wd in self.vocab:
+                    if wd[0] == "<" and len(wd) > 2:
+                        candidates = [(self.vocab[wd], wd, e)]
+                        break
+                    else:
+                        candidates.append((self.vocab[wd], wd, e))
+            if len(candidates) > 0:
+                # the smallest token_id is adopted
+                _, wd, e = min(candidates, key=lambda x: x[0])
+                result.append(wd)
+                pos = e
+            else:
+                end = pos + 1
+                wd = text[pos:end]
+                if check_simbol(wd):
+                    result.append("<KIGOU>")
+                elif checku2e(wd):
+                    result.append("<U2000U2BFF>")
+                else:
+                    for i in wd.encode("utf-8"):
+                        result.append("<|byte%d|>" % i)
+                pos = end
+        return result
+
+    def convert_id_to_token(self, index):
+        return self.ids_to_tokens[index][0]
+
+
+__all__ = ["GPTSanJapaneseTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..3a4b3eb1be2b4e69dcad1540abdd91f412de0ca2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_graphormer import *
+    from .modeling_graphormer import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx
new file mode 100644
index 0000000000000000000000000000000000000000..a0fafbdee53b55efb9596036817b03be0d006992
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/algos_graphormer.pyx
@@ -0,0 +1,107 @@
+# Copyright (c) Microsoft Corporation and HuggingFace
+# Licensed under the MIT License.
+
+import cython
+
+cimport numpy
+from cython.parallel cimport parallel, prange
+
+import numpy as np
+
+
+# Reduce this number if matrices are too big for large graphs
+UNREACHABLE_NODE_DISTANCE = 510 
+
+def floyd_warshall(adjacency_matrix):
+    """
+    Applies the Floyd-Warshall algorithm to the adjacency matrix, to compute the 
+    shortest paths distance between all nodes, up to UNREACHABLE_NODE_DISTANCE.
+    """
+    (nrows, ncols) = adjacency_matrix.shape
+    assert nrows == ncols
+    cdef unsigned int n = nrows
+
+    adj_mat_copy = adjacency_matrix.astype(np.int32, order='C', casting='safe', copy=True)
+    assert adj_mat_copy.flags['C_CONTIGUOUS']
+    cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] M = adj_mat_copy
+    cdef numpy.ndarray[numpy.int32_t, ndim=2, mode='c'] path = -1 * np.ones([n, n], dtype=np.int32)
+
+    cdef unsigned int i, j, k
+    cdef numpy.int32_t M_ij, M_ik, cost_ikkj
+    cdef numpy.int32_t* M_ptr = &M[0,0]
+    cdef numpy.int32_t* M_i_ptr
+    cdef numpy.int32_t* M_k_ptr
+
+    # set unreachable nodes distance to UNREACHABLE_NODE_DISTANCE
+    for i in range(n):
+        for j in range(n):
+            if i == j:
+                M[i][j] = 0
+            elif M[i][j] == 0:
+                M[i][j] = UNREACHABLE_NODE_DISTANCE
+
+    # floyed algo
+    for k in range(n):
+        M_k_ptr = M_ptr + n*k
+        for i in range(n):
+            M_i_ptr = M_ptr + n*i
+            M_ik = M_i_ptr[k]
+            for j in range(n):
+                cost_ikkj = M_ik + M_k_ptr[j]
+                M_ij = M_i_ptr[j]
+                if M_ij > cost_ikkj:
+                    M_i_ptr[j] = cost_ikkj
+                    path[i][j] = k
+
+    # set unreachable path to UNREACHABLE_NODE_DISTANCE
+    for i in range(n):
+        for j in range(n):
+            if M[i][j] >= UNREACHABLE_NODE_DISTANCE:
+                path[i][j] = UNREACHABLE_NODE_DISTANCE
+                M[i][j] = UNREACHABLE_NODE_DISTANCE
+
+    return M, path
+
+
+def get_all_edges(path, i, j):
+    """
+    Recursive function to compute all possible paths between two nodes from the graph adjacency matrix.
+    """
+    cdef int k = path[i][j]
+    if k == -1:
+        return []
+    else:
+        return get_all_edges(path, i, k) + [k] + get_all_edges(path, k, j)
+
+
+def gen_edge_input(max_dist, path, edge_feat):
+    """
+    Generates the full edge feature and adjacency matrix.
+    Shape: num_nodes * num_nodes * max_distance_between_nodes * num_edge_features
+    Dim 1 is the input node, dim 2 the output node of the edge, dim 3 the depth of the edge, dim 4 the feature
+    """
+    (nrows, ncols) = path.shape
+    assert nrows == ncols
+    cdef unsigned int n = nrows
+    cdef unsigned int max_dist_copy = max_dist
+
+    path_copy = path.astype(long, order='C', casting='safe', copy=True)
+    edge_feat_copy = edge_feat.astype(long, order='C', casting='safe', copy=True)
+    assert path_copy.flags['C_CONTIGUOUS']
+    assert edge_feat_copy.flags['C_CONTIGUOUS']
+
+    cdef numpy.ndarray[numpy.int32_t, ndim=4, mode='c'] edge_fea_all = -1 * np.ones([n, n, max_dist_copy, edge_feat.shape[-1]], dtype=np.int32)
+    cdef unsigned int i, j, k, num_path, cur
+
+    for i in range(n):
+        for j in range(n):
+            if i == j:
+                continue
+            if path_copy[i][j] == UNREACHABLE_NODE_DISTANCE:
+                continue
+            path = [i] + get_all_edges(path_copy, i, j) + [j]
+            num_path = len(path) - 1
+            for k in range(num_path):
+                edge_fea_all[i, j, k, :] = edge_feat_copy[path[k], path[k+1], :]
+
+    return edge_fea_all
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py
new file mode 100644
index 0000000000000000000000000000000000000000..88657bab435d95cf5f06014524b184939e8582c7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/collating_graphormer.py
@@ -0,0 +1,135 @@
+# Copyright (c) Microsoft Corporation and HuggingFace
+# Licensed under the MIT License.
+
+from collections.abc import Mapping
+from typing import Any
+
+import numpy as np
+import torch
+
+from ....utils import is_cython_available, requires_backends
+
+
+if is_cython_available():
+    import pyximport
+
+    pyximport.install(setup_args={"include_dirs": np.get_include()})
+    from . import algos_graphormer
+
+
+def convert_to_single_emb(x, offset: int = 512):
+    feature_num = x.shape[1] if len(x.shape) > 1 else 1
+    feature_offset = 1 + np.arange(0, feature_num * offset, offset, dtype=np.int64)
+    x = x + feature_offset
+    return x
+
+
+def preprocess_item(item, keep_features=True):
+    requires_backends(preprocess_item, ["cython"])
+
+    if keep_features and "edge_attr" in item:  # edge_attr
+        edge_attr = np.asarray(item["edge_attr"], dtype=np.int64)
+    else:
+        edge_attr = np.ones((len(item["edge_index"][0]), 1), dtype=np.int64)  # same embedding for all
+
+    if keep_features and "node_feat" in item:  # input_nodes
+        node_feature = np.asarray(item["node_feat"], dtype=np.int64)
+    else:
+        node_feature = np.ones((item["num_nodes"], 1), dtype=np.int64)  # same embedding for all
+
+    edge_index = np.asarray(item["edge_index"], dtype=np.int64)
+
+    input_nodes = convert_to_single_emb(node_feature) + 1
+    num_nodes = item["num_nodes"]
+
+    if len(edge_attr.shape) == 1:
+        edge_attr = edge_attr[:, None]
+    attn_edge_type = np.zeros([num_nodes, num_nodes, edge_attr.shape[-1]], dtype=np.int64)
+    attn_edge_type[edge_index[0], edge_index[1]] = convert_to_single_emb(edge_attr) + 1
+
+    # node adj matrix [num_nodes, num_nodes] bool
+    adj = np.zeros([num_nodes, num_nodes], dtype=bool)
+    adj[edge_index[0], edge_index[1]] = True
+
+    shortest_path_result, path = algos_graphormer.floyd_warshall(adj)
+    max_dist = np.amax(shortest_path_result)
+
+    input_edges = algos_graphormer.gen_edge_input(max_dist, path, attn_edge_type)
+    attn_bias = np.zeros([num_nodes + 1, num_nodes + 1], dtype=np.single)  # with graph token
+
+    # combine
+    item["input_nodes"] = input_nodes + 1  # we shift all indices by one for padding
+    item["attn_bias"] = attn_bias
+    item["attn_edge_type"] = attn_edge_type
+    item["spatial_pos"] = shortest_path_result.astype(np.int64) + 1  # we shift all indices by one for padding
+    item["in_degree"] = np.sum(adj, axis=1).reshape(-1) + 1  # we shift all indices by one for padding
+    item["out_degree"] = item["in_degree"]  # for undirected graph
+    item["input_edges"] = input_edges + 1  # we shift all indices by one for padding
+    if "labels" not in item:
+        item["labels"] = item["y"]
+
+    return item
+
+
+class GraphormerDataCollator:
+    def __init__(self, spatial_pos_max=20, on_the_fly_processing=False):
+        if not is_cython_available():
+            raise ImportError("Graphormer preprocessing needs Cython (pyximport)")
+
+        self.spatial_pos_max = spatial_pos_max
+        self.on_the_fly_processing = on_the_fly_processing
+
+    def __call__(self, features: list[dict]) -> dict[str, Any]:
+        if self.on_the_fly_processing:
+            features = [preprocess_item(i) for i in features]
+
+        if not isinstance(features[0], Mapping):
+            features = [vars(f) for f in features]
+        batch = {}
+
+        max_node_num = max(len(i["input_nodes"]) for i in features)
+        node_feat_size = len(features[0]["input_nodes"][0])
+        edge_feat_size = len(features[0]["attn_edge_type"][0][0])
+        max_dist = max(len(i["input_edges"][0][0]) for i in features)
+        edge_input_size = len(features[0]["input_edges"][0][0][0])
+        batch_size = len(features)
+
+        batch["attn_bias"] = torch.zeros(batch_size, max_node_num + 1, max_node_num + 1, dtype=torch.float)
+        batch["attn_edge_type"] = torch.zeros(batch_size, max_node_num, max_node_num, edge_feat_size, dtype=torch.long)
+        batch["spatial_pos"] = torch.zeros(batch_size, max_node_num, max_node_num, dtype=torch.long)
+        batch["in_degree"] = torch.zeros(batch_size, max_node_num, dtype=torch.long)
+        batch["input_nodes"] = torch.zeros(batch_size, max_node_num, node_feat_size, dtype=torch.long)
+        batch["input_edges"] = torch.zeros(
+            batch_size, max_node_num, max_node_num, max_dist, edge_input_size, dtype=torch.long
+        )
+
+        for ix, f in enumerate(features):
+            for k in ["attn_bias", "attn_edge_type", "spatial_pos", "in_degree", "input_nodes", "input_edges"]:
+                f[k] = torch.tensor(f[k])
+
+            if len(f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max]) > 0:
+                f["attn_bias"][1:, 1:][f["spatial_pos"] >= self.spatial_pos_max] = float("-inf")
+
+            batch["attn_bias"][ix, : f["attn_bias"].shape[0], : f["attn_bias"].shape[1]] = f["attn_bias"]
+            batch["attn_edge_type"][ix, : f["attn_edge_type"].shape[0], : f["attn_edge_type"].shape[1], :] = f[
+                "attn_edge_type"
+            ]
+            batch["spatial_pos"][ix, : f["spatial_pos"].shape[0], : f["spatial_pos"].shape[1]] = f["spatial_pos"]
+            batch["in_degree"][ix, : f["in_degree"].shape[0]] = f["in_degree"]
+            batch["input_nodes"][ix, : f["input_nodes"].shape[0], :] = f["input_nodes"]
+            batch["input_edges"][
+                ix, : f["input_edges"].shape[0], : f["input_edges"].shape[1], : f["input_edges"].shape[2], :
+            ] = f["input_edges"]
+
+        batch["out_degree"] = batch["in_degree"]
+
+        sample = features[0]["labels"]
+        if len(sample) == 1:  # one task
+            if isinstance(sample[0], float):  # regression
+                batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
+            else:  # binary classification
+                batch["labels"] = torch.from_numpy(np.concatenate([i["labels"] for i in features]))
+        else:  # multi task classification, left to float to keep the NaNs
+            batch["labels"] = torch.from_numpy(np.stack([i["labels"] for i in features], axis=0))
+
+        return batch
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py
new file mode 100644
index 0000000000000000000000000000000000000000..81c13c6a802b3128d4f88b23f59ac31e8484e901
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/configuration_graphormer.py
@@ -0,0 +1,220 @@
+# coding=utf-8
+# Copyright 2022 Microsoft, clefourrier and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Graphormer model configuration"""
+
+from typing import Optional
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class GraphormerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~GraphormerModel`]. It is used to instantiate an
+    Graphormer model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the Graphormer
+    [graphormer-base-pcqm4mv1](https://huggingface.co/graphormer-base-pcqm4mv1) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        num_classes (`int`, *optional*, defaults to 1):
+            Number of target classes or labels, set to n for binary classification of n tasks.
+        num_atoms (`int`, *optional*, defaults to 512*9):
+            Number of node types in the graphs.
+        num_edges (`int`, *optional*, defaults to 512*3):
+            Number of edges types in the graph.
+        num_in_degree (`int`, *optional*, defaults to 512):
+            Number of in degrees types in the input graphs.
+        num_out_degree (`int`, *optional*, defaults to 512):
+            Number of out degrees types in the input graphs.
+        num_edge_dis (`int`, *optional*, defaults to 128):
+            Number of edge dis in the input graphs.
+        multi_hop_max_dist (`int`, *optional*, defaults to 20):
+            Maximum distance of multi hop edges between two nodes.
+        spatial_pos_max (`int`, *optional*, defaults to 1024):
+            Maximum distance between nodes in the graph attention bias matrices, used during preprocessing and
+            collation.
+        edge_type (`str`, *optional*, defaults to multihop):
+            Type of edge relation chosen.
+        max_nodes (`int`, *optional*, defaults to 512):
+            Maximum number of nodes which can be parsed for the input graphs.
+        share_input_output_embed (`bool`, *optional*, defaults to `False`):
+            Shares the embedding layer between encoder and decoder - careful, True is not implemented.
+        num_layers (`int`, *optional*, defaults to 12):
+            Number of layers.
+        embedding_dim (`int`, *optional*, defaults to 768):
+            Dimension of the embedding layer in encoder.
+        ffn_embedding_dim (`int`, *optional*, defaults to 768):
+            Dimension of the "intermediate" (often named feed-forward) layer in encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads in the encoder.
+        self_attention (`bool`, *optional*, defaults to `True`):
+            Model is self attentive (False not implemented).
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the attention weights.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the activation of the linear transformer layer.
+        layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        bias (`bool`, *optional*, defaults to `True`):
+            Uses bias in the attention module - unsupported at the moment.
+        embed_scale(`float`, *optional*, defaults to None):
+            Scaling factor for the node embeddings.
+        num_trans_layers_to_freeze (`int`, *optional*, defaults to 0):
+            Number of transformer layers to freeze.
+        encoder_normalize_before (`bool`, *optional*, defaults to `False`):
+            Normalize features before encoding the graph.
+        pre_layernorm (`bool`, *optional*, defaults to `False`):
+            Apply layernorm before self attention and the feed forward network. Without this, post layernorm will be
+            used.
+        apply_graphormer_init (`bool`, *optional*, defaults to `False`):
+            Apply a custom graphormer initialisation to the model before training.
+        freeze_embeddings (`bool`, *optional*, defaults to `False`):
+            Freeze the embedding layer, or train it along the model.
+        encoder_normalize_before (`bool`, *optional*, defaults to `False`):
+            Apply the layer norm before each encoder block.
+        q_noise (`float`, *optional*, defaults to 0.0):
+            Amount of quantization noise (see "Training with Quantization Noise for Extreme Model Compression"). (For
+            more detail, see fairseq's documentation on quant_noise).
+        qn_block_size (`int`, *optional*, defaults to 8):
+            Size of the blocks for subsequent quantization with iPQ (see q_noise).
+        kdim (`int`, *optional*, defaults to None):
+            Dimension of the key in the attention, if different from the other values.
+        vdim (`int`, *optional*, defaults to None):
+            Dimension of the value in the attention, if different from the other values.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        traceable (`bool`, *optional*, defaults to `False`):
+            Changes return value of the encoder's inner_state to stacked tensors.
+
+        Example:
+            ```python
+            >>> from transformers import GraphormerForGraphClassification, GraphormerConfig
+
+            >>> # Initializing a Graphormer graphormer-base-pcqm4mv2 style configuration
+            >>> configuration = GraphormerConfig()
+
+            >>> # Initializing a model from the graphormer-base-pcqm4mv1 style configuration
+            >>> model = GraphormerForGraphClassification(configuration)
+
+            >>> # Accessing the model configuration
+            >>> configuration = model.config
+            ```
+    """
+
+    model_type = "graphormer"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        num_classes: int = 1,
+        num_atoms: int = 512 * 9,
+        num_edges: int = 512 * 3,
+        num_in_degree: int = 512,
+        num_out_degree: int = 512,
+        num_spatial: int = 512,
+        num_edge_dis: int = 128,
+        multi_hop_max_dist: int = 5,  # sometimes is 20
+        spatial_pos_max: int = 1024,
+        edge_type: str = "multi_hop",
+        max_nodes: int = 512,
+        share_input_output_embed: bool = False,
+        num_hidden_layers: int = 12,
+        embedding_dim: int = 768,
+        ffn_embedding_dim: int = 768,
+        num_attention_heads: int = 32,
+        dropout: float = 0.1,
+        attention_dropout: float = 0.1,
+        activation_dropout: float = 0.1,
+        layerdrop: float = 0.0,
+        encoder_normalize_before: bool = False,
+        pre_layernorm: bool = False,
+        apply_graphormer_init: bool = False,
+        activation_fn: str = "gelu",
+        embed_scale: Optional[float] = None,
+        freeze_embeddings: bool = False,
+        num_trans_layers_to_freeze: int = 0,
+        traceable: bool = False,
+        q_noise: float = 0.0,
+        qn_block_size: int = 8,
+        kdim: Optional[int] = None,
+        vdim: Optional[int] = None,
+        bias: bool = True,
+        self_attention: bool = True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        self.num_classes = num_classes
+        self.num_atoms = num_atoms
+        self.num_in_degree = num_in_degree
+        self.num_out_degree = num_out_degree
+        self.num_edges = num_edges
+        self.num_spatial = num_spatial
+        self.num_edge_dis = num_edge_dis
+        self.edge_type = edge_type
+        self.multi_hop_max_dist = multi_hop_max_dist
+        self.spatial_pos_max = spatial_pos_max
+        self.max_nodes = max_nodes
+        self.num_hidden_layers = num_hidden_layers
+        self.embedding_dim = embedding_dim
+        self.hidden_size = embedding_dim
+        self.ffn_embedding_dim = ffn_embedding_dim
+        self.num_attention_heads = num_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.layerdrop = layerdrop
+        self.encoder_normalize_before = encoder_normalize_before
+        self.pre_layernorm = pre_layernorm
+        self.apply_graphormer_init = apply_graphormer_init
+        self.activation_fn = activation_fn
+        self.embed_scale = embed_scale
+        self.freeze_embeddings = freeze_embeddings
+        self.num_trans_layers_to_freeze = num_trans_layers_to_freeze
+        self.share_input_output_embed = share_input_output_embed
+        self.traceable = traceable
+        self.q_noise = q_noise
+        self.qn_block_size = qn_block_size
+
+        # These parameters are here for future extensions
+        # atm, the model only supports self attention
+        self.kdim = kdim
+        self.vdim = vdim
+        self.self_attention = self_attention
+        self.bias = bias
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+
+__all__ = ["GraphormerConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b3e8ea742c8dabd4bd13e34ea84fa60547fc0645
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/graphormer/modeling_graphormer.py
@@ -0,0 +1,912 @@
+# coding=utf-8
+# Copyright 2022 Microsoft, clefourrier The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Graphormer model."""
+
+import math
+from collections.abc import Iterable, Iterator
+from typing import Optional, Union
+
+import torch
+import torch.nn as nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....modeling_outputs import (
+    BaseModelOutputWithNoAttention,
+    SequenceClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....utils import logging
+from .configuration_graphormer import GraphormerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "graphormer-base-pcqm4mv1"
+_CONFIG_FOR_DOC = "GraphormerConfig"
+
+
+def quant_noise(module: nn.Module, p: float, block_size: int):
+    """
+    From:
+    https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/quant_noise.py
+
+    Wraps modules and applies quantization noise to the weights for subsequent quantization with Iterative Product
+    Quantization as described in "Training with Quantization Noise for Extreme Model Compression"
+
+    Args:
+        - module: nn.Module
+        - p: amount of Quantization Noise
+        - block_size: size of the blocks for subsequent quantization with iPQ
+
+    Remarks:
+        - Module weights must have the right sizes wrt the block size
+        - Only Linear, Embedding and Conv2d modules are supported for the moment
+        - For more detail on how to quantize by blocks with convolutional weights, see "And the Bit Goes Down:
+          Revisiting the Quantization of Neural Networks"
+        - We implement the simplest form of noise here as stated in the paper which consists in randomly dropping
+          blocks
+    """
+
+    # if no quantization noise, don't register hook
+    if p <= 0:
+        return module
+
+    # supported modules
+    if not isinstance(module, (nn.Linear, nn.Embedding, nn.Conv2d)):
+        raise NotImplementedError("Module unsupported for quant_noise.")
+
+    # test whether module.weight has the right sizes wrt block_size
+    is_conv = module.weight.ndim == 4
+
+    # 2D matrix
+    if not is_conv:
+        if module.weight.size(1) % block_size != 0:
+            raise AssertionError("Input features must be a multiple of block sizes")
+
+    # 4D matrix
+    else:
+        # 1x1 convolutions
+        if module.kernel_size == (1, 1):
+            if module.in_channels % block_size != 0:
+                raise AssertionError("Input channels must be a multiple of block sizes")
+        # regular convolutions
+        else:
+            k = module.kernel_size[0] * module.kernel_size[1]
+            if k % block_size != 0:
+                raise AssertionError("Kernel size must be a multiple of block size")
+
+    def _forward_pre_hook(mod, input):
+        # no noise for evaluation
+        if mod.training:
+            if not is_conv:
+                # gather weight and sizes
+                weight = mod.weight
+                in_features = weight.size(1)
+                out_features = weight.size(0)
+
+                # split weight matrix into blocks and randomly drop selected blocks
+                mask = torch.zeros(in_features // block_size * out_features, device=weight.device)
+                mask.bernoulli_(p)
+                mask = mask.repeat_interleave(block_size, -1).view(-1, in_features)
+
+            else:
+                # gather weight and sizes
+                weight = mod.weight
+                in_channels = mod.in_channels
+                out_channels = mod.out_channels
+
+                # split weight matrix into blocks and randomly drop selected blocks
+                if mod.kernel_size == (1, 1):
+                    mask = torch.zeros(
+                        int(in_channels // block_size * out_channels),
+                        device=weight.device,
+                    )
+                    mask.bernoulli_(p)
+                    mask = mask.repeat_interleave(block_size, -1).view(-1, in_channels)
+                else:
+                    mask = torch.zeros(weight.size(0), weight.size(1), device=weight.device)
+                    mask.bernoulli_(p)
+                    mask = mask.unsqueeze(2).unsqueeze(3).repeat(1, 1, mod.kernel_size[0], mod.kernel_size[1])
+
+            # scale weights and apply mask
+            mask = mask.to(torch.bool)  # x.bool() is not currently supported in TorchScript
+            s = 1 / (1 - p)
+            mod.weight.data = s * weight.masked_fill(mask, 0)
+
+    module.register_forward_pre_hook(_forward_pre_hook)
+    return module
+
+
+class LayerDropModuleList(nn.ModuleList):
+    """
+    From:
+    https://github.com/facebookresearch/fairseq/blob/dd0079bde7f678b0cd0715cbd0ae68d661b7226d/fairseq/modules/layer_drop.py
+    A LayerDrop implementation based on [`torch.nn.ModuleList`]. LayerDrop as described in
+    https://huggingface.co/papers/1909.11556.
+
+    We refresh the choice of which layers to drop every time we iterate over the LayerDropModuleList instance. During
+    evaluation we always iterate over all layers.
+
+    Usage:
+
+    ```python
+    layers = LayerDropList(p=0.5, modules=[layer1, layer2, layer3])
+    for layer in layers:  # this might iterate over layers 1 and 3
+        x = layer(x)
+    for layer in layers:  # this might iterate over all layers
+        x = layer(x)
+    for layer in layers:  # this might not iterate over any layers
+        x = layer(x)
+    ```
+
+    Args:
+        p (float): probability of dropping out each layer
+        modules (iterable, optional): an iterable of modules to add
+    """
+
+    def __init__(self, p: float, modules: Optional[Iterable[nn.Module]] = None):
+        super().__init__(modules)
+        self.p = p
+
+    def __iter__(self) -> Iterator[nn.Module]:
+        dropout_probs = torch.empty(len(self)).uniform_()
+        for i, m in enumerate(super().__iter__()):
+            if not self.training or (dropout_probs[i] > self.p):
+                yield m
+
+
+class GraphormerGraphNodeFeature(nn.Module):
+    """
+    Compute node features for each node in the graph.
+    """
+
+    def __init__(self, config: GraphormerConfig):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.num_atoms = config.num_atoms
+
+        self.atom_encoder = nn.Embedding(config.num_atoms + 1, config.hidden_size, padding_idx=config.pad_token_id)
+        self.in_degree_encoder = nn.Embedding(
+            config.num_in_degree, config.hidden_size, padding_idx=config.pad_token_id
+        )
+        self.out_degree_encoder = nn.Embedding(
+            config.num_out_degree, config.hidden_size, padding_idx=config.pad_token_id
+        )
+
+        self.graph_token = nn.Embedding(1, config.hidden_size)
+
+    def forward(
+        self,
+        input_nodes: torch.LongTensor,
+        in_degree: torch.LongTensor,
+        out_degree: torch.LongTensor,
+    ) -> torch.Tensor:
+        n_graph, n_node = input_nodes.size()[:2]
+
+        node_feature = (  # node feature + graph token
+            self.atom_encoder(input_nodes).sum(dim=-2)  # [n_graph, n_node, n_hidden]
+            + self.in_degree_encoder(in_degree)
+            + self.out_degree_encoder(out_degree)
+        )
+
+        graph_token_feature = self.graph_token.weight.unsqueeze(0).repeat(n_graph, 1, 1)
+
+        graph_node_feature = torch.cat([graph_token_feature, node_feature], dim=1)
+
+        return graph_node_feature
+
+
+class GraphormerGraphAttnBias(nn.Module):
+    """
+    Compute attention bias for each head.
+    """
+
+    def __init__(self, config: GraphormerConfig):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        self.multi_hop_max_dist = config.multi_hop_max_dist
+
+        # We do not change edge feature embedding learning, as edge embeddings are represented as a combination of the original features
+        # + shortest path
+        self.edge_encoder = nn.Embedding(config.num_edges + 1, config.num_attention_heads, padding_idx=0)
+
+        self.edge_type = config.edge_type
+        if self.edge_type == "multi_hop":
+            self.edge_dis_encoder = nn.Embedding(
+                config.num_edge_dis * config.num_attention_heads * config.num_attention_heads,
+                1,
+            )
+
+        self.spatial_pos_encoder = nn.Embedding(config.num_spatial, config.num_attention_heads, padding_idx=0)
+
+        self.graph_token_virtual_distance = nn.Embedding(1, config.num_attention_heads)
+
+    def forward(
+        self,
+        input_nodes: torch.LongTensor,
+        attn_bias: torch.Tensor,
+        spatial_pos: torch.LongTensor,
+        input_edges: torch.LongTensor,
+        attn_edge_type: torch.LongTensor,
+    ) -> torch.Tensor:
+        n_graph, n_node = input_nodes.size()[:2]
+        graph_attn_bias = attn_bias.clone()
+        graph_attn_bias = graph_attn_bias.unsqueeze(1).repeat(
+            1, self.num_heads, 1, 1
+        )  # [n_graph, n_head, n_node+1, n_node+1]
+
+        # spatial pos
+        # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node]
+        spatial_pos_bias = self.spatial_pos_encoder(spatial_pos).permute(0, 3, 1, 2)
+        graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + spatial_pos_bias
+
+        # reset spatial pos here
+        t = self.graph_token_virtual_distance.weight.view(1, self.num_heads, 1)
+        graph_attn_bias[:, :, 1:, 0] = graph_attn_bias[:, :, 1:, 0] + t
+        graph_attn_bias[:, :, 0, :] = graph_attn_bias[:, :, 0, :] + t
+
+        # edge feature
+        if self.edge_type == "multi_hop":
+            spatial_pos_ = spatial_pos.clone()
+
+            spatial_pos_[spatial_pos_ == 0] = 1  # set pad to 1
+            # set 1 to 1, input_nodes > 1 to input_nodes - 1
+            spatial_pos_ = torch.where(spatial_pos_ > 1, spatial_pos_ - 1, spatial_pos_)
+            if self.multi_hop_max_dist > 0:
+                spatial_pos_ = spatial_pos_.clamp(0, self.multi_hop_max_dist)
+                input_edges = input_edges[:, :, :, : self.multi_hop_max_dist, :]
+            # [n_graph, n_node, n_node, max_dist, n_head]
+
+            input_edges = self.edge_encoder(input_edges).mean(-2)
+            max_dist = input_edges.size(-2)
+            edge_input_flat = input_edges.permute(3, 0, 1, 2, 4).reshape(max_dist, -1, self.num_heads)
+            edge_input_flat = torch.bmm(
+                edge_input_flat,
+                self.edge_dis_encoder.weight.reshape(-1, self.num_heads, self.num_heads)[:max_dist, :, :],
+            )
+            input_edges = edge_input_flat.reshape(max_dist, n_graph, n_node, n_node, self.num_heads).permute(
+                1, 2, 3, 0, 4
+            )
+            input_edges = (input_edges.sum(-2) / (spatial_pos_.float().unsqueeze(-1))).permute(0, 3, 1, 2)
+        else:
+            # [n_graph, n_node, n_node, n_head] -> [n_graph, n_head, n_node, n_node]
+            input_edges = self.edge_encoder(attn_edge_type).mean(-2).permute(0, 3, 1, 2)
+
+        graph_attn_bias[:, :, 1:, 1:] = graph_attn_bias[:, :, 1:, 1:] + input_edges
+        graph_attn_bias = graph_attn_bias + attn_bias.unsqueeze(1)  # reset
+
+        return graph_attn_bias
+
+
+class GraphormerMultiheadAttention(nn.Module):
+    """Multi-headed attention.
+
+    See "Attention Is All You Need" for more details.
+    """
+
+    def __init__(self, config: GraphormerConfig):
+        super().__init__()
+        self.embedding_dim = config.embedding_dim
+        self.kdim = config.kdim if config.kdim is not None else config.embedding_dim
+        self.vdim = config.vdim if config.vdim is not None else config.embedding_dim
+        self.qkv_same_dim = self.kdim == config.embedding_dim and self.vdim == config.embedding_dim
+
+        self.num_heads = config.num_attention_heads
+        self.attention_dropout_module = torch.nn.Dropout(p=config.attention_dropout, inplace=False)
+
+        self.head_dim = config.embedding_dim // config.num_attention_heads
+        if not (self.head_dim * config.num_attention_heads == self.embedding_dim):
+            raise AssertionError("The embedding_dim must be divisible by num_heads.")
+        self.scaling = self.head_dim**-0.5
+
+        self.self_attention = True  # config.self_attention
+        if not (self.self_attention):
+            raise NotImplementedError("The Graphormer model only supports self attention for now.")
+        if self.self_attention and not self.qkv_same_dim:
+            raise AssertionError("Self-attention requires query, key and value to be of the same size.")
+
+        self.k_proj = quant_noise(
+            nn.Linear(self.kdim, config.embedding_dim, bias=config.bias),
+            config.q_noise,
+            config.qn_block_size,
+        )
+        self.v_proj = quant_noise(
+            nn.Linear(self.vdim, config.embedding_dim, bias=config.bias),
+            config.q_noise,
+            config.qn_block_size,
+        )
+        self.q_proj = quant_noise(
+            nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias),
+            config.q_noise,
+            config.qn_block_size,
+        )
+
+        self.out_proj = quant_noise(
+            nn.Linear(config.embedding_dim, config.embedding_dim, bias=config.bias),
+            config.q_noise,
+            config.qn_block_size,
+        )
+
+        self.onnx_trace = False
+
+    def reset_parameters(self):
+        if self.qkv_same_dim:
+            # Empirically observed the convergence to be much better with
+            # the scaled initialization
+            nn.init.xavier_uniform_(self.k_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.v_proj.weight, gain=1 / math.sqrt(2))
+            nn.init.xavier_uniform_(self.q_proj.weight, gain=1 / math.sqrt(2))
+        else:
+            nn.init.xavier_uniform_(self.k_proj.weight)
+            nn.init.xavier_uniform_(self.v_proj.weight)
+            nn.init.xavier_uniform_(self.q_proj.weight)
+
+        nn.init.xavier_uniform_(self.out_proj.weight)
+        if self.out_proj.bias is not None:
+            nn.init.constant_(self.out_proj.bias, 0.0)
+
+    def forward(
+        self,
+        query: torch.LongTensor,
+        key: Optional[torch.Tensor],
+        value: Optional[torch.Tensor],
+        attn_bias: Optional[torch.Tensor],
+        key_padding_mask: Optional[torch.Tensor] = None,
+        need_weights: bool = True,
+        attn_mask: Optional[torch.Tensor] = None,
+        before_softmax: bool = False,
+        need_head_weights: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Args:
+            key_padding_mask (Bytetorch.Tensor, optional): mask to exclude
+                keys that are pads, of shape `(batch, src_len)`, where padding elements are indicated by 1s.
+            need_weights (bool, optional): return the attention weights,
+                averaged over heads (default: False).
+            attn_mask (Bytetorch.Tensor, optional): typically used to
+                implement causal attention, where the mask prevents the attention from looking forward in time
+                (default: None).
+            before_softmax (bool, optional): return the raw attention
+                weights and values before the attention softmax.
+            need_head_weights (bool, optional): return the attention
+                weights for each head. Implies *need_weights*. Default: return the average attention weights over all
+                heads.
+        """
+        if need_head_weights:
+            need_weights = True
+
+        tgt_len, bsz, embedding_dim = query.size()
+        src_len = tgt_len
+        if not (embedding_dim == self.embedding_dim):
+            raise AssertionError(
+                f"The query embedding dimension {embedding_dim} is not equal to the expected embedding_dim"
+                f" {self.embedding_dim}."
+            )
+        if not (list(query.size()) == [tgt_len, bsz, embedding_dim]):
+            raise AssertionError("Query size incorrect in Graphormer, compared to model dimensions.")
+
+        if key is not None:
+            src_len, key_bsz, _ = key.size()
+            if not torch.jit.is_scripting():
+                if (key_bsz != bsz) or (value is None) or not (src_len, bsz == value.shape[:2]):
+                    raise AssertionError(
+                        "The batch shape does not match the key or value shapes provided to the attention."
+                    )
+
+        q = self.q_proj(query)
+        k = self.k_proj(query)
+        v = self.v_proj(query)
+
+        q *= self.scaling
+
+        q = q.contiguous().view(tgt_len, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        if k is not None:
+            k = k.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+        if v is not None:
+            v = v.contiguous().view(-1, bsz * self.num_heads, self.head_dim).transpose(0, 1)
+
+        if (k is None) or not (k.size(1) == src_len):
+            raise AssertionError("The shape of the key generated in the attention is incorrect")
+
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+
+        if key_padding_mask is not None:
+            if key_padding_mask.size(0) != bsz or key_padding_mask.size(1) != src_len:
+                raise AssertionError(
+                    "The shape of the generated padding mask for the key does not match expected dimensions."
+                )
+        attn_weights = torch.bmm(q, k.transpose(1, 2))
+        attn_weights = self.apply_sparse_mask(attn_weights, tgt_len, src_len, bsz)
+
+        if list(attn_weights.size()) != [bsz * self.num_heads, tgt_len, src_len]:
+            raise AssertionError("The attention weights generated do not match the expected dimensions.")
+
+        if attn_bias is not None:
+            attn_weights += attn_bias.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if attn_mask is not None:
+            attn_mask = attn_mask.unsqueeze(0)
+            attn_weights += attn_mask
+
+        if key_padding_mask is not None:
+            # don't attend to padding symbols
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.masked_fill(
+                key_padding_mask.unsqueeze(1).unsqueeze(2).to(torch.bool), float("-inf")
+            )
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if before_softmax:
+            return attn_weights, v
+
+        attn_weights_float = torch.nn.functional.softmax(attn_weights, dim=-1)
+        attn_weights = attn_weights_float.type_as(attn_weights)
+        attn_probs = self.attention_dropout_module(attn_weights)
+
+        if v is None:
+            raise AssertionError("No value generated")
+        attn = torch.bmm(attn_probs, v)
+        if list(attn.size()) != [bsz * self.num_heads, tgt_len, self.head_dim]:
+            raise AssertionError("The attention generated do not match the expected dimensions.")
+
+        attn = attn.transpose(0, 1).contiguous().view(tgt_len, bsz, embedding_dim)
+        attn: torch.Tensor = self.out_proj(attn)
+
+        attn_weights = None
+        if need_weights:
+            attn_weights = attn_weights_float.contiguous().view(bsz, self.num_heads, tgt_len, src_len).transpose(1, 0)
+            if not need_head_weights:
+                # average attention weights over heads
+                attn_weights = attn_weights.mean(dim=0)
+
+        return attn, attn_weights
+
+    def apply_sparse_mask(self, attn_weights: torch.Tensor, tgt_len: int, src_len: int, bsz: int) -> torch.Tensor:
+        return attn_weights
+
+
+class GraphormerGraphEncoderLayer(nn.Module):
+    def __init__(self, config: GraphormerConfig) -> None:
+        super().__init__()
+
+        # Initialize parameters
+        self.embedding_dim = config.embedding_dim
+        self.num_attention_heads = config.num_attention_heads
+        self.q_noise = config.q_noise
+        self.qn_block_size = config.qn_block_size
+        self.pre_layernorm = config.pre_layernorm
+
+        self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
+
+        self.activation_dropout_module = torch.nn.Dropout(p=config.activation_dropout, inplace=False)
+
+        # Initialize blocks
+        self.activation_fn = ACT2FN[config.activation_fn]
+        self.self_attn = GraphormerMultiheadAttention(config)
+
+        # layer norm associated with the self attention layer
+        self.self_attn_layer_norm = nn.LayerNorm(self.embedding_dim)
+
+        self.fc1 = self.build_fc(
+            self.embedding_dim,
+            config.ffn_embedding_dim,
+            q_noise=config.q_noise,
+            qn_block_size=config.qn_block_size,
+        )
+        self.fc2 = self.build_fc(
+            config.ffn_embedding_dim,
+            self.embedding_dim,
+            q_noise=config.q_noise,
+            qn_block_size=config.qn_block_size,
+        )
+
+        # layer norm associated with the position wise feed-forward NN
+        self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
+
+    def build_fc(
+        self, input_dim: int, output_dim: int, q_noise: float, qn_block_size: int
+    ) -> Union[nn.Module, nn.Linear, nn.Embedding, nn.Conv2d]:
+        return quant_noise(nn.Linear(input_dim, output_dim), q_noise, qn_block_size)
+
+    def forward(
+        self,
+        input_nodes: torch.Tensor,
+        self_attn_bias: Optional[torch.Tensor] = None,
+        self_attn_mask: Optional[torch.Tensor] = None,
+        self_attn_padding_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        nn.LayerNorm is applied either before or after the self-attention/ffn modules similar to the original
+        Transformer implementation.
+        """
+        residual = input_nodes
+        if self.pre_layernorm:
+            input_nodes = self.self_attn_layer_norm(input_nodes)
+
+        input_nodes, attn = self.self_attn(
+            query=input_nodes,
+            key=input_nodes,
+            value=input_nodes,
+            attn_bias=self_attn_bias,
+            key_padding_mask=self_attn_padding_mask,
+            need_weights=False,
+            attn_mask=self_attn_mask,
+        )
+        input_nodes = self.dropout_module(input_nodes)
+        input_nodes = residual + input_nodes
+        if not self.pre_layernorm:
+            input_nodes = self.self_attn_layer_norm(input_nodes)
+
+        residual = input_nodes
+        if self.pre_layernorm:
+            input_nodes = self.final_layer_norm(input_nodes)
+        input_nodes = self.activation_fn(self.fc1(input_nodes))
+        input_nodes = self.activation_dropout_module(input_nodes)
+        input_nodes = self.fc2(input_nodes)
+        input_nodes = self.dropout_module(input_nodes)
+        input_nodes = residual + input_nodes
+        if not self.pre_layernorm:
+            input_nodes = self.final_layer_norm(input_nodes)
+
+        return input_nodes, attn
+
+
+class GraphormerGraphEncoder(nn.Module):
+    def __init__(self, config: GraphormerConfig):
+        super().__init__()
+
+        self.dropout_module = torch.nn.Dropout(p=config.dropout, inplace=False)
+        self.layerdrop = config.layerdrop
+        self.embedding_dim = config.embedding_dim
+        self.apply_graphormer_init = config.apply_graphormer_init
+        self.traceable = config.traceable
+
+        self.graph_node_feature = GraphormerGraphNodeFeature(config)
+        self.graph_attn_bias = GraphormerGraphAttnBias(config)
+
+        self.embed_scale = config.embed_scale
+
+        if config.q_noise > 0:
+            self.quant_noise = quant_noise(
+                nn.Linear(self.embedding_dim, self.embedding_dim, bias=False),
+                config.q_noise,
+                config.qn_block_size,
+            )
+        else:
+            self.quant_noise = None
+
+        if config.encoder_normalize_before:
+            self.emb_layer_norm = nn.LayerNorm(self.embedding_dim)
+        else:
+            self.emb_layer_norm = None
+
+        if config.pre_layernorm:
+            self.final_layer_norm = nn.LayerNorm(self.embedding_dim)
+
+        if self.layerdrop > 0.0:
+            self.layers = LayerDropModuleList(p=self.layerdrop)
+        else:
+            self.layers = nn.ModuleList([])
+        self.layers.extend([GraphormerGraphEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+
+        # Apply initialization of model params after building the model
+        if config.freeze_embeddings:
+            raise NotImplementedError("Freezing embeddings is not implemented yet.")
+
+        for layer in range(config.num_trans_layers_to_freeze):
+            m = self.layers[layer]
+            if m is not None:
+                for p in m.parameters():
+                    p.requires_grad = False
+
+    def forward(
+        self,
+        input_nodes: torch.LongTensor,
+        input_edges: torch.LongTensor,
+        attn_bias: torch.Tensor,
+        in_degree: torch.LongTensor,
+        out_degree: torch.LongTensor,
+        spatial_pos: torch.LongTensor,
+        attn_edge_type: torch.LongTensor,
+        perturb=None,
+        last_state_only: bool = False,
+        token_embeddings: Optional[torch.Tensor] = None,
+        attn_mask: Optional[torch.Tensor] = None,
+    ) -> tuple[Union[torch.Tensor, list[torch.LongTensor]], torch.Tensor]:
+        # compute padding mask. This is needed for multi-head attention
+        data_x = input_nodes
+        n_graph, n_node = data_x.size()[:2]
+        padding_mask = (data_x[:, :, 0]).eq(0)
+        padding_mask_cls = torch.zeros(n_graph, 1, device=padding_mask.device, dtype=padding_mask.dtype)
+        padding_mask = torch.cat((padding_mask_cls, padding_mask), dim=1)
+
+        attn_bias = self.graph_attn_bias(input_nodes, attn_bias, spatial_pos, input_edges, attn_edge_type)
+
+        if token_embeddings is not None:
+            input_nodes = token_embeddings
+        else:
+            input_nodes = self.graph_node_feature(input_nodes, in_degree, out_degree)
+
+        if perturb is not None:
+            input_nodes[:, 1:, :] += perturb
+
+        if self.embed_scale is not None:
+            input_nodes = input_nodes * self.embed_scale
+
+        if self.quant_noise is not None:
+            input_nodes = self.quant_noise(input_nodes)
+
+        if self.emb_layer_norm is not None:
+            input_nodes = self.emb_layer_norm(input_nodes)
+
+        input_nodes = self.dropout_module(input_nodes)
+
+        input_nodes = input_nodes.transpose(0, 1)
+
+        inner_states = []
+        if not last_state_only:
+            inner_states.append(input_nodes)
+
+        for layer in self.layers:
+            input_nodes, _ = layer(
+                input_nodes,
+                self_attn_padding_mask=padding_mask,
+                self_attn_mask=attn_mask,
+                self_attn_bias=attn_bias,
+            )
+            if not last_state_only:
+                inner_states.append(input_nodes)
+
+        graph_rep = input_nodes[0, :, :]
+
+        if last_state_only:
+            inner_states = [input_nodes]
+
+        if self.traceable:
+            return torch.stack(inner_states), graph_rep
+        else:
+            return inner_states, graph_rep
+
+
+class GraphormerDecoderHead(nn.Module):
+    def __init__(self, embedding_dim: int, num_classes: int):
+        super().__init__()
+        """num_classes should be 1 for regression, or the number of classes for classification"""
+        self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
+        self.classifier = nn.Linear(embedding_dim, num_classes, bias=False)
+        self.num_classes = num_classes
+
+    def forward(self, input_nodes: torch.Tensor, **unused) -> torch.Tensor:
+        input_nodes = self.classifier(input_nodes)
+        input_nodes = input_nodes + self.lm_output_learned_bias
+        return input_nodes
+
+
+class GraphormerPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: GraphormerConfig
+    base_model_prefix = "graphormer"
+    main_input_name_nodes = "input_nodes"
+    main_input_name_edges = "input_edges"
+
+    def normal_(self, data: torch.Tensor):
+        # with FSDP, module params will be on CUDA, so we cast them back to CPU
+        # so that the RNG is consistent with and without FSDP
+        data.copy_(data.cpu().normal_(mean=0.0, std=0.02).to(data.device))
+
+    def init_graphormer_params(self, module: Union[nn.Linear, nn.Embedding, GraphormerMultiheadAttention]):
+        """
+        Initialize the weights specific to the Graphormer Model.
+        """
+        if isinstance(module, nn.Linear):
+            self.normal_(module.weight.data)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        if isinstance(module, nn.Embedding):
+            self.normal_(module.weight.data)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        if isinstance(module, GraphormerMultiheadAttention):
+            self.normal_(module.q_proj.weight.data)
+            self.normal_(module.k_proj.weight.data)
+            self.normal_(module.v_proj.weight.data)
+
+    def _init_weights(
+        self,
+        module: Union[
+            nn.Linear, nn.Conv2d, nn.Embedding, nn.LayerNorm, GraphormerMultiheadAttention, GraphormerGraphEncoder
+        ],
+    ):
+        """
+        Initialize the weights
+        """
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # We might be missing part of the Linear init, dependent on the layer num
+            module.weight.data.normal_(mean=0.0, std=0.02)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=0.02)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, GraphormerMultiheadAttention):
+            module.q_proj.weight.data.normal_(mean=0.0, std=0.02)
+            module.k_proj.weight.data.normal_(mean=0.0, std=0.02)
+            module.v_proj.weight.data.normal_(mean=0.0, std=0.02)
+            module.reset_parameters()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, GraphormerGraphEncoder):
+            if module.apply_graphormer_init:
+                module.apply(self.init_graphormer_params)
+
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+class GraphormerModel(GraphormerPreTrainedModel):
+    """The Graphormer model is a graph-encoder model.
+
+    It goes from a graph to its representation. If you want to use the model for a downstream classification task, use
+    GraphormerForGraphClassification instead. For any other downstream task, feel free to add a new class, or combine
+    this model with a downstream model of your choice, following the example in GraphormerForGraphClassification.
+    """
+
+    def __init__(self, config: GraphormerConfig):
+        super().__init__(config)
+        self.max_nodes = config.max_nodes
+
+        self.graph_encoder = GraphormerGraphEncoder(config)
+
+        self.share_input_output_embed = config.share_input_output_embed
+        self.lm_output_learned_bias = None
+
+        # Remove head is set to true during fine-tuning
+        self.load_softmax = not getattr(config, "remove_head", False)
+
+        self.lm_head_transform_weight = nn.Linear(config.embedding_dim, config.embedding_dim)
+        self.activation_fn = ACT2FN[config.activation_fn]
+        self.layer_norm = nn.LayerNorm(config.embedding_dim)
+
+        self.post_init()
+
+    def reset_output_layer_parameters(self):
+        self.lm_output_learned_bias = nn.Parameter(torch.zeros(1))
+
+    def forward(
+        self,
+        input_nodes: torch.LongTensor,
+        input_edges: torch.LongTensor,
+        attn_bias: torch.Tensor,
+        in_degree: torch.LongTensor,
+        out_degree: torch.LongTensor,
+        spatial_pos: torch.LongTensor,
+        attn_edge_type: torch.LongTensor,
+        perturb: Optional[torch.FloatTensor] = None,
+        masked_tokens: None = None,
+        return_dict: Optional[bool] = None,
+        **unused,
+    ) -> Union[tuple[torch.LongTensor], BaseModelOutputWithNoAttention]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        inner_states, graph_rep = self.graph_encoder(
+            input_nodes, input_edges, attn_bias, in_degree, out_degree, spatial_pos, attn_edge_type, perturb=perturb
+        )
+
+        # last inner state, then revert Batch and Graph len
+        input_nodes = inner_states[-1].transpose(0, 1)
+
+        # project masked tokens only
+        if masked_tokens is not None:
+            raise NotImplementedError
+
+        input_nodes = self.layer_norm(self.activation_fn(self.lm_head_transform_weight(input_nodes)))
+
+        # project back to size of vocabulary
+        if self.share_input_output_embed and hasattr(self.graph_encoder.embed_tokens, "weight"):
+            input_nodes = torch.nn.functional.linear(input_nodes, self.graph_encoder.embed_tokens.weight)
+
+        if not return_dict:
+            return tuple(x for x in [input_nodes, inner_states] if x is not None)
+        return BaseModelOutputWithNoAttention(last_hidden_state=input_nodes, hidden_states=inner_states)
+
+    def max_nodes(self):
+        """Maximum output length supported by the encoder."""
+        return self.max_nodes
+
+
+class GraphormerForGraphClassification(GraphormerPreTrainedModel):
+    """
+    This model can be used for graph-level classification or regression tasks.
+
+    It can be trained on
+    - regression (by setting config.num_classes to 1); there should be one float-type label per graph
+    - one task classification (by setting config.num_classes to the number of classes); there should be one integer
+      label per graph
+    - binary multi-task classification (by setting config.num_classes to the number of labels); there should be a list
+      of integer labels for each graph.
+    """
+
+    def __init__(self, config: GraphormerConfig):
+        super().__init__(config)
+        self.encoder = GraphormerModel(config)
+        self.embedding_dim = config.embedding_dim
+        self.num_classes = config.num_classes
+        self.classifier = GraphormerDecoderHead(self.embedding_dim, self.num_classes)
+        self.is_encoder_decoder = True
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_nodes: torch.LongTensor,
+        input_edges: torch.LongTensor,
+        attn_bias: torch.Tensor,
+        in_degree: torch.LongTensor,
+        out_degree: torch.LongTensor,
+        spatial_pos: torch.LongTensor,
+        attn_edge_type: torch.LongTensor,
+        labels: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+        **unused,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            input_nodes,
+            input_edges,
+            attn_bias,
+            in_degree,
+            out_degree,
+            spatial_pos,
+            attn_edge_type,
+            return_dict=True,
+        )
+        outputs, hidden_states = encoder_outputs["last_hidden_state"], encoder_outputs["hidden_states"]
+
+        head_outputs = self.classifier(outputs)
+        logits = head_outputs[:, 0, :].contiguous()
+
+        loss = None
+        if labels is not None:
+            mask = ~torch.isnan(labels)
+
+            if self.num_classes == 1:  # regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits[mask].squeeze(), labels[mask].squeeze().float())
+            elif self.num_classes > 1 and len(labels.shape) == 1:  # One task classification
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits[mask].view(-1, self.num_classes), labels[mask].view(-1))
+            else:  # Binary multi-task classification
+                loss_fct = BCEWithLogitsLoss(reduction="sum")
+                loss = loss_fct(logits[mask], labels[mask])
+
+        if not return_dict:
+            return tuple(x for x in [loss, logits, hidden_states] if x is not None)
+        return SequenceClassifierOutput(loss=loss, logits=logits, hidden_states=hidden_states, attentions=None)
+
+
+__all__ = ["GraphormerForGraphClassification", "GraphormerModel", "GraphormerPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..826bdbddc1f182de43a795ab0d78ad4009507c14
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_jukebox import *
+    from .modeling_jukebox import *
+    from .tokenization_jukebox import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py
new file mode 100644
index 0000000000000000000000000000000000000000..9bb2e3da54aa232bf934a7922f731796a7146937
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/configuration_jukebox.py
@@ -0,0 +1,613 @@
+# coding=utf-8
+# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Jukebox configuration"""
+
+import os
+from typing import Union
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+_LARGE_ATTENTION = [
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "block_attn",
+    "transpose_block_attn",
+    "prev_block_attn",
+    "cross_attention",
+]
+_RawColumnPreviousRowAttention = ["block_attn", "transpose_block_attn", "prev_block_attn"]
+_FullDenseAttention = ["dense_attention"]
+_PrimePrimeDenseAttention = ["prime_attn", "prime_attn", "dense_attn"]
+
+
+def full_dense_attention(layer):
+    return _FullDenseAttention[0]
+
+
+def raw_column_previous_row_attention(layer):
+    return _RawColumnPreviousRowAttention[layer % 3]
+
+
+def large_separated_enc_dec_w_lyrics(layer):
+    return _LARGE_ATTENTION[layer % 79]
+
+
+def enc_dec_with_lyrics(layer):
+    if layer % 16 == 15:
+        return _PrimePrimeDenseAttention[layer % 3]
+    return _RawColumnPreviousRowAttention[layer % 3]
+
+
+ATTENTION_PATTERNS = {
+    "full_dense_attention": full_dense_attention,
+    "raw_column_previous_row_attention": raw_column_previous_row_attention,  # Alternate row, column and previous row attn
+    "large_separated_enc_dec_w_lyrics": large_separated_enc_dec_w_lyrics,  # Used by large separated_enc_dec model with lyrics
+    "enc_dec_with_lyrics": enc_dec_with_lyrics,  # Used by encoder_decoder model with lyrics
+}
+
+
+class JukeboxPriorConfig(PretrainedConfig):
+    """
+        This is the configuration class to store the configuration of a [`JukeboxPrior`]. It is used to instantiate a
+        `JukeboxPrior` according to the specified arguments, defining the model architecture. Instantiating a
+        configuration with the defaults will yield a similar configuration to that of the top level prior from the
+        [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox
+    -1b-lyrics) architecture.
+
+        Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+        documentation from [`PretrainedConfig`] for more information.
+
+
+
+    Args:
+        act_fn (`str`, *optional*, defaults to `"quick_gelu"`):
+            Activation function.
+        alignment_head (`int`, *optional*, defaults to 2):
+            Head that is responsible of the alignment between lyrics and music. Only used to compute the lyric to audio
+            alignment
+        alignment_layer (`int`, *optional*, defaults to 68):
+            Index of the layer that is responsible of the alignment between lyrics and music. Only used to compute the
+            lyric to audio alignment
+        attention_multiplier (`float`, *optional*, defaults to 0.25):
+            Multiplier coefficient used to define the hidden dimension of the attention layers. 0.25 means that
+            0.25*width of the model will be used.
+        attention_pattern (`str`, *optional*, defaults to `"enc_dec_with_lyrics"`):
+            Which attention pattern to use for the decoder/
+        attn_dropout (`int`, *optional*, defaults to 0):
+            Dropout probability for the post-attention layer dropout in the decoder.
+        attn_res_scale (`bool`, *optional*, defaults to `False`):
+            Whether or not to scale the residuals in the attention conditioner block.
+        blocks (`int`, *optional*, defaults to 64):
+            Number of blocks used in the `block_attn`. A sequence of length seq_len is factored as `[blocks, seq_len //
+            blocks]` in the `JukeboxAttention` layer.
+        conv_res_scale (`int`, *optional*):
+            Whether or not to scale the residuals in the conditioner block. Since the top level prior does not have a
+            conditioner, the default value is to None and should not be modified.
+        num_layers (`int`, *optional*, defaults to 72):
+            Number of layers of the transformer architecture.
+        emb_dropout (`int`, *optional*, defaults to 0):
+            Embedding dropout used in the lyric decoder.
+        encoder_config (`JukeboxPriorConfig`, *optional*) :
+            Configuration of the encoder which models the prior on the lyrics.
+        encoder_loss_fraction (`float`, *optional*, defaults to 0.4):
+            Multiplication factor used in front of the lyric encoder loss.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Hidden dimension of the attention layers.
+        init_scale (`float`, *optional*, defaults to 0.2):
+            Initialization scales for the prior modules.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether or not the prior is an encoder-decoder model. In case it is not, and `nb_relevant_lyric_tokens` is
+            greater than 0, the `encoder` args should be specified for the lyric encoding.
+        mask (`bool`, *optional*, defaults to `False`):
+            Whether or not to mask the previous positions in the attention.
+        max_duration (`int`, *optional*, defaults to 600):
+            Maximum supported duration of the generated song in seconds.
+        max_nb_genres (`int`, *optional*, defaults to 1):
+            Maximum number of genres that can be used to condition the model.
+        merged_decoder (`bool`, *optional*, defaults to `True`):
+            Whether or not the decoder and the encoder inputs are merged. This is used for the separated
+            encoder-decoder architecture
+        metadata_conditioning (`bool`, *optional*, defaults to `True)`:
+            Whether or not to condition on the artist and genre metadata.
+        metadata_dims (`List[int]`, *optional*, defaults to `[604, 7898]`):
+            Number of genres and the number of artists that were used to train the embedding layers of the prior
+            models.
+        min_duration (`int`, *optional*, defaults to 0):
+            Minimum duration of the generated audio on which the model was trained.
+        mlp_multiplier (`float`, *optional*, defaults to 1.0):
+            Multiplier coefficient used to define the hidden dimension of the MLP layers. 0.25 means that 0.25*width of
+            the model will be used.
+        music_vocab_size (`int`, *optional*, defaults to 2048):
+            Number of different music tokens. Should be similar to the `JukeboxVQVAEConfig.nb_discrete_codes`.
+        n_ctx (`int`, *optional*, defaults to 6144):
+            Number of context tokens for each prior. The context tokens are the music tokens that are attended to when
+            generating music tokens.
+        n_heads (`int`, *optional*, defaults to 2):
+                Number of attention heads.
+        nb_relevant_lyric_tokens (`int`, *optional*, defaults to 384):
+            Number of lyric tokens that are used when sampling a single window of length `n_ctx`
+        res_conv_depth (`int`, *optional*, defaults to 3):
+            Depth of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
+            `JukeboxMusicTokenConditioner`.
+        res_conv_width (`int`, *optional*, defaults to 128):
+            Width of the `JukeboxDecoderConvBock` used to upsample the previously sampled audio in the
+            `JukeboxMusicTokenConditioner`.
+        res_convolution_multiplier (`int`, *optional*, defaults to 1):
+            Multiplier used to scale the `hidden_dim` of the `JukeboxResConv1DBlock`.
+        res_dilation_cycle (`int`, *optional*):
+            Dilation cycle used to define the `JukeboxMusicTokenConditioner`. Usually similar to the ones used in the
+            corresponding level of the VQVAE. The first prior does not use it as it is not conditioned on upper level
+            tokens.
+        res_dilation_growth_rate (`int`, *optional*, defaults to 1):
+            Dilation grow rate used between each convolutionnal block of the `JukeboxMusicTokenConditioner`
+        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
+            Downsampling rates used in the audio conditioning network
+        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
+            Striding used in the audio conditioning network
+        resid_dropout (`int`, *optional*, defaults to 0):
+            Residual dropout used in the attention pattern.
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            Sampling rate used for training.
+        spread (`int`, *optional*):
+            Spread used in the `summary_spread_attention` pattern
+        timing_dims (`int`, *optional*, defaults to 64):
+            Dimension of the timing embedding.
+        zero_out (`bool`, *optional*, defaults to `False`):
+            Whether or not to zero out convolution weights when initializing.
+    """
+
+    model_type = "jukebox_prior"
+    attribute_map = {
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+    }
+
+    def __init__(
+        self,
+        act_fn="quick_gelu",
+        level=0,
+        alignment_head=2,
+        alignment_layer=68,
+        attention_multiplier=0.25,
+        attention_pattern="enc_dec_with_lyrics",
+        attn_dropout=0,
+        attn_res_scale=False,
+        blocks=64,
+        conv_res_scale=None,
+        num_layers=72,
+        emb_dropout=0,
+        encoder_config=None,
+        encoder_loss_fraction=0.4,
+        hidden_size=2048,
+        init_scale=0.2,
+        is_encoder_decoder=True,
+        lyric_vocab_size=80,
+        mask=False,
+        max_duration=600,
+        max_nb_genres=1,
+        merged_decoder=True,
+        metadata_conditioning=True,
+        metadata_dims=[604, 7898],
+        min_duration=0,
+        mlp_multiplier=1.0,
+        music_vocab_size=2048,
+        n_ctx=6144,
+        n_heads=2,
+        nb_relevant_lyric_tokens=384,
+        res_conv_depth=3,
+        res_conv_width=128,
+        res_convolution_multiplier=1,
+        res_dilation_cycle=None,
+        res_dilation_growth_rate=1,
+        res_downs_t=[3, 2, 2],
+        res_strides_t=[2, 2, 2],
+        resid_dropout=0,
+        sampling_rate=44100,
+        spread=None,
+        timing_dims=64,
+        zero_out=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.act_fn = act_fn
+        self.alignment_head = alignment_head
+        self.alignment_layer = alignment_layer
+        self.attention_multiplier = attention_multiplier
+        self.attention_pattern = attention_pattern
+        self.attn_dropout = attn_dropout
+        self.attn_res_scale = attn_res_scale
+        self.blocks = blocks
+        self.conv_res_scale = conv_res_scale
+        self.num_layers = num_layers
+        self.emb_dropout = emb_dropout
+        self.music_vocab_size = music_vocab_size
+        if encoder_config is not None:
+            self.encoder_config = JukeboxPriorConfig(**encoder_config)
+        else:
+            self.encoder_config = None
+        self.encoder_loss_fraction = encoder_loss_fraction
+        self.init_scale = init_scale
+        self.is_encoder_decoder = is_encoder_decoder
+        self.lyric_vocab_size = lyric_vocab_size
+        self.level = level
+        self.mask = mask
+        self.max_duration = max_duration
+        self.max_nb_genres = max_nb_genres
+        self.merged_decoder = merged_decoder
+        self.metadata_conditioning = metadata_conditioning
+        self.metadata_dims = metadata_dims
+        self.min_duration = min_duration
+        self.mlp_multiplier = mlp_multiplier
+        self.n_ctx = n_ctx
+        self.n_heads = n_heads
+        self.nb_relevant_lyric_tokens = nb_relevant_lyric_tokens
+        self.res_conv_depth = res_conv_depth
+        self.res_conv_width = res_conv_width
+        self.res_convolution_multiplier = res_convolution_multiplier
+        self.res_dilation_cycle = res_dilation_cycle
+        self.res_dilation_growth_rate = res_dilation_growth_rate
+        self.res_downs_t = res_downs_t
+        self.res_strides_t = res_strides_t
+        self.resid_dropout = resid_dropout
+        self.sampling_rate = sampling_rate
+        self.spread = spread
+        self.timing_dims = timing_dims
+        self.hidden_size = hidden_size
+        self.zero_out = zero_out
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], level=0, **kwargs):
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        # get the prior config dict if we are loading from JukeboxConfig
+        if config_dict.get("model_type") == "jukebox":
+            config_dict = config_dict[f"prior_{level}"]
+
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class JukeboxVQVAEConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`JukeboxVQVAE`]. It is used to instantiate a
+    `JukeboxVQVAE` according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the VQVAE from
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        act_fn (`str`, *optional*, defaults to `"relu"`):
+            Activation function of the model.
+        nb_discrete_codes (`int`, *optional*, defaults to 2048):
+            Number of codes of the VQVAE.
+        commit (`float`, *optional*, defaults to 0.02):
+            Commit loss multiplier.
+        conv_input_shape (`int`, *optional*, defaults to 1):
+            Number of audio channels.
+        conv_res_scale (`bool`, *optional*, defaults to `False`):
+            Whether or not to scale the residuals of the `JukeboxResConv1DBlock`.
+        embed_dim (`int`, *optional*, defaults to 64):
+            Embedding dimension of the codebook vectors.
+        hop_fraction (`List[int]`, *optional*, defaults to `[0.125, 0.5, 0.5]`):
+            Fraction of non-intersecting window used when continuing the sampling process.
+        levels (`int`, *optional*, defaults to 3):
+            Number of hierarchical levels that used in the VQVAE.
+        lmu (`float`, *optional*, defaults to 0.99):
+            Used in the codebook update, exponential moving average coefficient. For more detail refer to Appendix A.1
+            of the original [VQVAE paper](https://huggingface.co/papers/1711.00937v2.pdf)
+        multipliers (`List[int]`, *optional*, defaults to `[2, 1, 1]`):
+            Depth and width multipliers used for each level. Used on the `res_conv_width` and `res_conv_depth`
+        res_conv_depth (`int`, *optional*, defaults to 4):
+            Depth of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
+        res_conv_width (`int`, *optional*, defaults to 32):
+            Width of the encoder and decoder block. If no `multipliers` are used, this is the same for each level.
+        res_convolution_multiplier (`int`, *optional*, defaults to 1):
+            Scaling factor of the hidden dimension used in the `JukeboxResConv1DBlock`.
+        res_dilation_cycle (`int`, *optional*):
+            Dilation cycle value used in the `JukeboxResnet`. If an int is used, each new Conv1 block will have a depth
+            reduced by a power of `res_dilation_cycle`.
+        res_dilation_growth_rate (`int`, *optional*, defaults to 3):
+            Resnet dilation growth rate used in the VQVAE (dilation_growth_rate ** depth)
+        res_downs_t (`List[int]`, *optional*, defaults to `[3, 2, 2]`):
+            Downsampling rate for each level of the hierarchical VQ-VAE.
+        res_strides_t (`List[int]`, *optional*, defaults to `[2, 2, 2]`):
+            Stride used for each level of the hierarchical VQ-VAE.
+        sample_length (`int`, *optional*, defaults to 1058304):
+            Provides the max input shape of the VQVAE. Is used to compute the input shape of each level.
+        init_scale (`float`, *optional*, defaults to 0.2):
+            Initialization scale.
+        zero_out (`bool`, *optional*, defaults to `False`):
+            Whether or not to zero out convolution weights when initializing.
+    """
+
+    model_type = "jukebox_vqvae"
+
+    def __init__(
+        self,
+        act_fn="relu",
+        nb_discrete_codes=2048,
+        commit=0.02,
+        conv_input_shape=1,
+        conv_res_scale=False,
+        embed_dim=64,
+        hop_fraction=[0.125, 0.5, 0.5],
+        levels=3,
+        lmu=0.99,
+        multipliers=[2, 1, 1],
+        res_conv_depth=4,
+        res_conv_width=32,
+        res_convolution_multiplier=1,
+        res_dilation_cycle=None,
+        res_dilation_growth_rate=3,
+        res_downs_t=[3, 2, 2],
+        res_strides_t=[2, 2, 2],
+        sample_length=1058304,
+        init_scale=0.2,
+        zero_out=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hop_fraction = hop_fraction
+        self.conv_input_shape = conv_input_shape
+        self.sample_length = sample_length
+
+        # VQVAE parameters (all used)
+        self.levels = levels
+        self.embed_dim = embed_dim
+        self.nb_discrete_codes = nb_discrete_codes
+        self.res_conv_width = res_conv_width
+        self.res_conv_depth = res_conv_depth
+        self.res_convolution_multiplier = res_convolution_multiplier
+        self.res_dilation_growth_rate = res_dilation_growth_rate
+        self.res_dilation_cycle = res_dilation_cycle
+        self.multipliers = multipliers
+        self.res_downs_t = res_downs_t
+        self.res_strides_t = res_strides_t
+        self.lmu = lmu
+        self.commit = commit
+        self.conv_res_scale = conv_res_scale
+        self.act_fn = act_fn
+        self.init_scale = init_scale
+        self.zero_out = zero_out
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Union[str, os.PathLike], **kwargs):
+        cls._set_token_in_kwargs(kwargs)
+
+        config_dict, kwargs = cls.get_config_dict(pretrained_model_name_or_path, **kwargs)
+
+        # get the text config dict if we are loading from CLIPConfig
+        if config_dict.get("model_type") == "jukebox":
+            config_dict = config_dict["vqvae_config"]
+
+        if "model_type" in config_dict and hasattr(cls, "model_type") and config_dict["model_type"] != cls.model_type:
+            logger.warning(
+                f"You are using a model of type {config_dict['model_type']} to instantiate a model of type "
+                f"{cls.model_type}. This is not supported for all configurations of models and can yield errors."
+            )
+
+        return cls.from_dict(config_dict, **kwargs)
+
+
+class JukeboxConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`JukeboxModel`].
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information. Instantiating a configuration with the defaults will
+    yield a similar configuration to that of
+    [openai/jukebox-1b-lyrics](https://huggingface.co/openai/jukebox-1b-lyrics) architecture.
+
+
+    The downsampling and stride are used to determine downsampling of the input sequence. For example, downsampling =
+    (5,3), and strides = (2, 2) will downsample the audio by 2^5 = 32 to get the first level of codes, and 2**8 = 256
+    to get the second level codes. This is mostly true for training the top level prior and the upsamplers.
+
+    Args:
+        vqvae_config (`JukeboxVQVAEConfig`, *optional*):
+            Configuration for the `JukeboxVQVAE` model.
+        prior_config_list (`List[JukeboxPriorConfig]`, *optional*):
+            List of the configs for each of the `JukeboxPrior` of the model. The original architecture uses 3 priors.
+        nb_priors (`int`, *optional*, defaults to 3):
+            Number of prior models that will sequentially sample tokens. Each prior is conditional auto regressive
+            (decoder) model, apart from the top prior, which can include a lyric encoder. The available models were
+            trained using a top prior and 2 upsampler priors.
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            Sampling rate of the raw audio.
+        timing_dims (`int`, *optional*, defaults to 64):
+            Dimensions of the JukeboxRangeEmbedding layer which is equivalent to traditional positional embedding
+            layer. The timing embedding layer converts the absolute and relative position in the currently sampled
+            audio to a tensor of length `timing_dims` that will be added to the music tokens.
+        min_duration (`int`, *optional*, defaults to 0):
+            Minimum duration of the audios to generate
+        max_duration (`float`, *optional*, defaults to 600.0):
+            Maximum duration of the audios to generate
+        max_nb_genres (`int`, *optional*, defaults to 5):
+            Maximum number of genres that can be used to condition a single sample.
+        metadata_conditioning (`bool`, *optional*, defaults to `True`):
+            Whether or not to use metadata conditioning, corresponding to the artist, the genre and the min/maximum
+            duration.
+
+    Example:
+
+    ```python
+    >>> from transformers import JukeboxModel, JukeboxConfig
+
+    >>> # Initializing a Jukebox configuration
+    >>> configuration = JukeboxConfig()
+
+    >>> # Initializing a model from the configuration
+    >>> model = JukeboxModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "jukebox"
+
+    def __init__(
+        self,
+        vqvae_config=None,
+        prior_config_list=None,
+        nb_priors=3,
+        sampling_rate=44100,
+        timing_dims=64,
+        min_duration=0,
+        max_duration=600.0,
+        max_nb_genres=5,
+        metadata_conditioning=True,
+        **kwargs,
+    ):
+        if vqvae_config is None:
+            vqvae_config = {}
+            logger.info("vqvae_config is None. initializing the JukeboxVQVAE with default values.")
+
+        self.vqvae_config = JukeboxVQVAEConfig(**vqvae_config)
+        if prior_config_list is not None:
+            self.prior_configs = [JukeboxPriorConfig(**prior_config) for prior_config in prior_config_list]
+        else:
+            self.prior_configs = []
+            for prior_idx in range(nb_priors):
+                prior_config = kwargs.pop(f"prior_{prior_idx}", None)
+                if prior_config is None:
+                    prior_config = {}
+                    logger.info(
+                        f"prior_{prior_idx}'s  config is None. Initializing the JukeboxPriorConfig list with default"
+                        " values."
+                    )
+                self.prior_configs.append(JukeboxPriorConfig(**prior_config))
+
+        self.hop_fraction = self.vqvae_config.hop_fraction
+
+        self.nb_priors = nb_priors
+
+        # Metadata conditioning
+        self.max_nb_genres = max_nb_genres
+        self.sampling_rate = sampling_rate
+        self.timing_dims = timing_dims
+        self.min_duration = min_duration
+        self.max_duration = max_duration
+        self.metadata_conditioning = metadata_conditioning
+
+        super().__init__(**kwargs)
+
+    @classmethod
+    def from_configs(cls, prior_configs: list[JukeboxPriorConfig], vqvae_config: JukeboxVQVAEConfig, **kwargs):
+        r"""
+        Instantiate a [`JukeboxConfig`] (or a derived class) from clip text model configuration and clip vision model
+        configuration.
+
+        Returns:
+            [`JukeboxConfig`]: An instance of a configuration object
+        """
+        prior_config_list = [config.to_dict() for config in prior_configs]
+        return cls(prior_config_list=prior_config_list, vqvae_config_dict=vqvae_config.to_dict(), **kwargs)
+
+    def to_dict(self):
+        # Override the default to_dict to apply to_dict to the list of prior configs.
+        result = super().to_dict()
+        result["prior_config_list"] = [config.to_dict() for config in result.pop("prior_configs")]
+        return result
+
+
+__all__ = ["JukeboxConfig", "JukeboxPriorConfig", "JukeboxVQVAEConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py
new file mode 100644
index 0000000000000000000000000000000000000000..f928d49cf5f73156c1352e1a5cb8695a57a08c1c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/modeling_jukebox.py
@@ -0,0 +1,2670 @@
+# coding=utf-8
+# Copyright 2022 The OpenAI Team Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Jukebox model."""
+
+import math
+import os
+from typing import Optional
+
+import numpy as np
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import LayerNorm as FusedLayerNorm
+
+from ....activations import ACT2FN
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging
+from ....utils.logging import tqdm
+from .configuration_jukebox import ATTENTION_PATTERNS, JukeboxConfig, JukeboxPriorConfig, JukeboxVQVAEConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def filter_logits(logits, top_k=0, top_p=0.0, filter_value=-float("Inf")):
+    """
+    Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
+
+    Args:
+        logits (`torch.Tensor`):
+            logits distribution shape (vocabulary size)
+        top_k (`int`, *optional*, defaults to 0):
+            When `top_k >0` keep only top key tokens with highest probability (top-k filtering).
+        top_p (`int`, *optional*, defaults to 0):
+            When `top_p>0.0` keep the top tokens with cumulative probability >= `top_p` (nucleus filtering).
+    """
+    logits = logits.clone()
+    top_k = min(top_k, logits.size(-1))  # Safety check
+
+    if top_k > 0:
+        # Remove all tokens with a probability less than the last token of the top-k
+        indices_to_remove = logits < torch.topk(logits, top_k, dim=-1)[0][..., -1:]
+        logits[indices_to_remove] = filter_value
+
+    if top_p > 0.0:
+        sorted_logits, sorted_indices = torch.sort(logits, descending=True, dim=-1)
+        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
+
+        # Remove tokens with cumulative probability above the threshold
+        sorted_indices_to_remove = cumulative_probs > top_p
+        # Shift the indices to the right to keep also the first token above the threshold
+        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
+        sorted_indices_to_remove[..., 0] = 0
+
+        # indices_to_remove = sorted_indices[sorted_indices_to_remove]
+        indices_to_remove = torch.zeros_like(logits, dtype=torch.bool).scatter_(
+            dim=-1, index=sorted_indices, src=sorted_indices_to_remove
+        )
+        logits[indices_to_remove] = filter_value
+    return logits
+
+
+def get_relevant_lyric_tokens(full_tokens, max_n_lyric_tokens, total_length, offset, duration):
+    """
+    Extract only the relevant tokens based on the character position. A total of `max_n_lyric_tokens` tokens will be
+    returned. If the provided token sequence is smaller, it will be padded, otherwise, only characters ranging from the
+    midpoint - `max_n_lyric_tokens//2` to the midpoint + `max_n_lyric_tokens//2` will be returned. This *focuses* on
+    the most relevant tokens (in time) for the sequence.
+
+    Args:
+        full_tokens (`list[int]`):
+            List containing the token ids of the entire lyrics.
+        total_length (`int`):
+            Total expected length of the music (not all of it is generated, see duration), in samples.
+        offset (`int`):
+            Starting sample in the music. If the offset is greater than 0, the lyrics will be shifted take that into
+            account
+        duration (`int`):
+            Expected duration of the generated music, in samples. The duration has to be smaller than the total length,
+            which represent the overall length of the signal,
+    """
+    full_tokens = full_tokens[0]
+    if len(full_tokens) < max_n_lyric_tokens:
+        tokens = torch.cat(
+            [torch.zeros(max_n_lyric_tokens - len(full_tokens), dtype=torch.long).to(full_tokens.device), full_tokens]
+        )
+        indices = [-1] * (max_n_lyric_tokens - len(full_tokens)) + list(range(0, len(full_tokens)))
+    else:
+        midpoint = int(len(full_tokens) * (offset + duration / 2.0) / total_length)
+        midpoint = min(max(midpoint, max_n_lyric_tokens // 2), len(full_tokens) - max_n_lyric_tokens // 2)
+        tokens = full_tokens[midpoint - max_n_lyric_tokens // 2 : midpoint + max_n_lyric_tokens // 2]
+        indices = list(range(midpoint - max_n_lyric_tokens // 2, midpoint + max_n_lyric_tokens // 2))
+    return tokens.unsqueeze(dim=0), indices
+
+
+# Break total_length into hops/windows of size n_ctx separated by hop_length
+def get_starts(total_length, n_ctx, hop_length):
+    starts = []
+    for start in range(0, total_length - n_ctx + hop_length, hop_length):
+        if start + n_ctx >= total_length:
+            # Last hop could be smaller, we make it n_ctx to maximise context
+            start = total_length - n_ctx
+        starts.append(start)
+    return starts
+
+
+def get_alignment(music_tokens, labels, prior, config):
+    level = prior.levels - 1  # Top level used
+    n_ctx = prior.n_ctx
+    tokens = music_tokens[level]
+    batch_size, total_length = tokens.shape[0], tokens.shape[1]
+    if total_length < n_ctx:
+        padding_length = n_ctx - total_length
+        tokens = torch.cat(
+            [tokens, torch.zeros(batch_size, n_ctx - total_length, dtype=tokens.dtype, device=tokens.device)], dim=1
+        )
+        total_length = tokens.shape[1]
+    else:
+        padding_length = 0
+
+    hop_length = int(config.hop_fraction[-level - 1] * prior.n_ctx)
+    alignment_head, alignment_layer = config.prior_alignment_head[0], config.prior_alignment_layer[0]
+    attn_layers = {alignment_layer}
+    alignment_hops = {}
+    indices_hops = {}
+    for start in tqdm(get_starts(total_length, n_ctx, hop_length), desc="Computing lyric to music alignment "):
+        end = start + n_ctx
+        # set metadata offset, sample_length and lyrics tokens
+        metadata, indices_hop = prior.get_metadata(labels, start, config.sample_length, get_indices=True, offset=0)
+        tokens_bs = torch.chunk(tokens, batch_size, dim=0)
+        metadata_bs = torch.chunk(metadata, batch_size, dim=0)
+        w_hops = []
+        for tokens_i, metadata_i in zip(tokens_bs, metadata_bs):
+            w_hop = prior.forward_tokens(tokens_i[:, start:end], [], metadata_i, get_attn_weights=attn_layers)
+            w_hops.append(w_hop[0][:, alignment_head])
+            del w_hop
+        weights = torch.cat(w_hops, dim=0)
+        del w_hops
+        alignment_hop = weights.to(device="cpu", dtype=torch.float).numpy()
+        del weights
+
+        # alignment_hop has shape (bs, n_ctx, nb_relevant_lyric_tokens)
+        # indices_hop is a list of len=bs, each entry of len hps.nb_relevant_lyric_tokens
+        indices_hops[start] = indices_hop
+        alignment_hops[start] = alignment_hop
+
+    # Combine attn for each hop into attn for full range
+    # Use indices to place them into correct place for corresponding source tokens
+    alignments = []
+    for item in range(batch_size):
+        # Note each item has different length lyrics
+        full_tokens = labels[0, 3:]
+        alignment = np.zeros((total_length, len(full_tokens) + 1))
+        for start in reversed(get_starts(total_length, n_ctx, hop_length)):
+            end = start + n_ctx
+            alignment_hop = alignment_hops[start][item]
+            indices = indices_hops[start][item]
+            alignment[start:end, indices] = alignment_hop
+        alignment = alignment[: total_length - padding_length, :-1]  # remove token padding, and last lyric index
+        alignments.append(alignment)
+    return alignments
+
+
+def save_temp_audio(fname, lvl, metas, aud):
+    aud = torch.clamp(aud, -1, 1).cpu().numpy()
+    for i in list(range(aud.shape[0])):
+        if metas is not None:
+            artists, genres, lyrics = list(metas)[i].values()
+            path = f"{fname}/lvl_{lvl}-{artists}-{genres}-{lyrics[:5]}-{i}"
+            np.save(path, aud[i])
+        else:
+            np.save(f"{fname}/lvl_{lvl}-sample-{i}", aud[i])
+
+
+def get_mask(mask, query_length, key_value_length, blocks, spread, device, sample, sample_t):
+    # returns a mask of shape 1 x 1 x query_length x key_value_length or None if masking is not needed.
+    if mask is None or query_length == 1:
+        return None
+    offset = sample_t - query_length if sample else max(key_value_length - query_length, 0)
+    if mask == "autoregressive":
+        # Masked dense
+        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
+    elif mask == "summary":
+        # Masked summary
+        mask = torch.ones(query_length, query_length, device=device).tril()
+        mask = torch.ones(query_length, query_length, device=device).tril()
+        mask = mask.view(query_length, blocks, query_length // blocks)[:, :-1, -key_value_length // blocks :]
+        mask = (
+            torch.nn.functional.pad(
+                mask,
+                (0, 0, 1, 0),
+                value=1,
+            )
+            .contiguous()
+            .view(query_length, key_value_length)
+        )
+    elif mask == "prime":
+        mask = torch.ones(query_length, key_value_length, device=device).tril(offset)
+    return mask.view(1, 1, query_length, key_value_length)
+
+
+class JukeboxConv1D(nn.Module):
+    def __init__(self, input_width, output_width):
+        super().__init__()
+        self.input_width = input_width
+        self.output_width = output_width
+        weight = torch.empty(input_width, output_width)
+        bias = torch.zeros(output_width)
+        self.weight = nn.Parameter(weight)
+        self.bias = nn.Parameter(bias)
+
+    def forward(self, hidden_states):
+        size_out = (*hidden_states.size()[:-1], self.output_width)
+        hidden_states = torch.addmm(
+            self.bias.type_as(hidden_states),
+            hidden_states.view(-1, hidden_states.size(-1)),
+            self.weight.type_as(hidden_states),
+        )
+        hidden_states = hidden_states.view(*size_out)
+        return hidden_states
+
+
+class JukeboxResConv1DBlock(nn.Module):
+    def __init__(self, config, conv_width, depth=1, res_scale=1.0):
+        super().__init__()
+        hidden_dim = config.res_convolution_multiplier * conv_width
+        dilation = config.res_dilation_growth_rate**depth
+        padding = dilation
+
+        self.res_scale = res_scale
+        self.activation = nn.ReLU()
+        self.conv1d_1 = nn.Conv1d(conv_width, hidden_dim, 3, 1, padding, dilation)
+        self.conv1d_2 = nn.Conv1d(hidden_dim, conv_width, 1, 1, 0)
+
+    def forward(self, hidden_states):
+        residuals = hidden_states
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.conv1d_1(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.conv1d_2(hidden_states)
+        return residuals + self.res_scale * hidden_states
+
+
+class JukeboxResnet1D(nn.Module):
+    def __init__(self, config, conv_width, n_depth, reverse_dilation=False):
+        super().__init__()
+        self.dilation_cycle = config.res_dilation_cycle
+        res_scale = 1.0 if not config.conv_res_scale else 1.0 / math.sqrt(n_depth)
+
+        blocks = []
+        for depth in range(n_depth):
+            block_depth = depth if self.dilation_cycle is None else depth % self.dilation_cycle
+            blocks.append(JukeboxResConv1DBlock(config, conv_width, block_depth, res_scale))
+
+        if reverse_dilation:
+            blocks = blocks[::-1]
+        self.resnet_block = nn.ModuleList(blocks)
+
+    def forward(self, hidden_states):
+        for block in self.resnet_block:
+            hidden_states = block(hidden_states)
+        return hidden_states
+
+
+class JukeboxEncoderConvBlock(nn.Module):
+    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t):
+        super().__init__()
+        blocks = []
+        filter_t = stride_t * 2
+        pad_t = stride_t // 2
+        if down_t > 0:
+            for i in range(down_t):
+                blocks.append(nn.Conv1d(embed_dim if i == 0 else hidden_dim, hidden_dim, filter_t, stride_t, pad_t))
+                blocks.append(JukeboxResnet1D(config, hidden_dim, depth))
+        self.proj_out = nn.Conv1d(hidden_dim, config.embed_dim, 3, 1, 1)
+        self.downsample_block = nn.ModuleList(blocks)
+
+    def forward(self, hidden_states):
+        for block in self.downsample_block:
+            hidden_states = block(hidden_states)
+        hidden_states = self.proj_out(hidden_states)
+        return hidden_states
+
+
+class JukeboxEncoder(nn.Module):
+    def __init__(self, config, width, depth, levels, downs_t, strides_t):
+        super().__init__()
+        self.levels = levels
+        self.level_blocks = nn.ModuleList()
+
+        iterator = zip(list(range(self.levels)), downs_t, strides_t)
+        for i, down_t, stride_t in iterator:
+            self.level_blocks.append(
+                JukeboxEncoderConvBlock(
+                    config, config.conv_input_shape if i == 0 else config.embed_dim, width, depth, down_t, stride_t
+                )
+            )
+
+    def forward(self, hidden_states):
+        all_hidden_states = []
+
+        # 64, 32, ...
+        for level in range(self.levels):
+            level_block = self.level_blocks[level]
+            hidden_states = level_block(hidden_states)
+            all_hidden_states.append(hidden_states)
+
+        return all_hidden_states
+
+
+class JukeboxDecoderConvBock(nn.Module):
+    def __init__(self, config, embed_dim, hidden_dim, depth, down_t, stride_t, reverse_dilation=True):
+        self.embed_dim = embed_dim
+        self.hidden_dim = hidden_dim
+        super().__init__()
+        blocks = []
+        if down_t > 0:
+            filter_t = stride_t * 2
+            pad_t = stride_t // 2
+            self.proj_in = nn.Conv1d(embed_dim, hidden_dim, 3, 1, 1)
+            for i in range(down_t):
+                blocks.append(JukeboxResnet1D(config, hidden_dim, depth, reverse_dilation))
+                blocks.append(
+                    nn.ConvTranspose1d(
+                        hidden_dim, hidden_dim if i < down_t - 1 else embed_dim, filter_t, stride_t, pad_t
+                    )
+                )
+        self.upsample_block = nn.ModuleList(blocks)
+
+    def forward(self, hidden_states):
+        hidden_states = self.proj_in(hidden_states)
+        for block in self.upsample_block:
+            hidden_states = block(hidden_states)
+        return hidden_states
+
+
+class JukeboxDecoder(nn.Module):
+    def __init__(self, config, hidden_dim, depth, levels, downs_t, strides_t):
+        super().__init__()
+        self.levels = levels
+        self.level_blocks = nn.ModuleList()
+        for level, down_t, stride_t in zip(list(range(self.levels)), downs_t, strides_t):
+            self.level_blocks.append(
+                JukeboxDecoderConvBock(config, config.embed_dim, hidden_dim, depth, down_t, stride_t)
+            )
+
+        self.out = nn.Conv1d(config.embed_dim, config.conv_input_shape, 3, 1, 1)
+
+    def forward(self, hidden_states, all_levels=True):
+        hidden_state = hidden_states[-1]
+
+        # 32, 64 ...
+        for level in reversed(range(self.levels)):
+            level_block = self.level_blocks[level]
+            hidden_state = level_block(hidden_state)
+
+            if level != 0 and all_levels:
+                hidden_state = hidden_state + hidden_states[level - 1]
+
+        hidden_state = self.out(hidden_state)
+        return hidden_state
+
+
+class JukeboxBottleneckBlock(nn.Module):
+    def __init__(self, config: JukeboxVQVAEConfig):
+        super().__init__()
+        self.nb_discrete_codes = config.nb_discrete_codes
+        self.codebook_width = config.embed_dim
+        self.mu = config.lmu
+        self.threshold = 1.0
+        self.init = False
+        self.codebook_sum = None
+        self.codebook_elem = None
+        self.register_buffer("codebook", torch.zeros(self.nb_discrete_codes, self.codebook_width))
+
+    def _tile(self, hidden_states):
+        dim, embed_width = hidden_states.shape
+        if dim < self.nb_discrete_codes:
+            n_repeats = (self.nb_discrete_codes + dim - 1) // dim
+            std = 0.01 / np.sqrt(embed_width)
+            hidden_states = hidden_states.repeat(n_repeats, 1)
+            hidden_states = hidden_states + torch.randn_like(hidden_states) * std
+        return hidden_states
+
+    def init_codebook(self, hidden_states):
+        nb_discrete_codes = self.nb_discrete_codes
+        self.init = True
+        codes = self._tile(hidden_states)
+        self.codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
+        self.codebook_sum = self.codebook
+        self.codebook_elem = torch.ones(nb_discrete_codes, device=self.codebook.device)
+
+    def update_codebook(self, hidden_states, latent_states):
+        mu, codebook_width, nb_discrete_codes = self.mu, self.codebook_width, self.nb_discrete_codes
+        with torch.no_grad():
+            # Calculate new centres
+            # nb_discrete_codes, batch_size * seq_length
+            latent_states_onehot = torch.zeros(nb_discrete_codes, hidden_states.shape[0], device=hidden_states.device)
+            latent_states_onehot.scatter_(0, latent_states.view(1, hidden_states.shape[0]), 1)
+
+            _codebook_sum = torch.matmul(latent_states_onehot, hidden_states)
+            _codebook_elem = latent_states_onehot.sum(dim=-1)  # nb_discrete_codes
+            codes = self._tile(hidden_states)
+            _random_codebook = codes[torch.randperm(codes.shape[0])][:nb_discrete_codes]
+
+            # Update centres
+            old_codebook = self.codebook
+            self.codebook_sum = mu * self.codebook_sum + (1.0 - mu) * _codebook_sum
+            self.codebook_elem = mu * self.codebook_elem + (1.0 - mu) * _codebook_elem  # nb_discrete_codes
+            usage = (self.codebook_elem.view(nb_discrete_codes, 1) >= self.threshold).float()
+
+            norm_code = self.codebook_sum.view(nb_discrete_codes, codebook_width) / self.codebook_elem.view(
+                nb_discrete_codes, 1
+            )
+            self.codebook = usage * (norm_code) + (1 - usage) * _random_codebook
+            _codebook_prob = _codebook_elem / torch.sum(_codebook_elem)  # prob of each bin
+            entropy = -torch.sum(_codebook_prob * torch.log(_codebook_prob + 1e-8))  # entropy ie how diverse
+            used_curr = (_codebook_elem >= self.threshold).sum()
+            usage = torch.sum(usage)
+            dk = torch.linalg.norm(self.codebook - old_codebook) / np.sqrt(np.prod(old_codebook.shape))
+        return {"entropy": entropy, "used_curr": used_curr, "usage": usage, "dk": dk}
+
+    def preprocess(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1).contiguous()
+        hidden_states = hidden_states.view(-1, hidden_states.shape[-1])
+
+        if hidden_states.shape[-1] == self.codebook_width:
+            prenorm = torch.linalg.norm(hidden_states - torch.mean(hidden_states)) / np.sqrt(
+                np.prod(hidden_states.shape)
+            )
+        elif hidden_states.shape[-1] == 2 * self.codebook_width:
+            x1, x2 = hidden_states[..., : self.codebook_width], hidden_states[..., self.codebook_width :]
+            prenorm = (torch.linalg.norm(x1 - torch.mean(x1)) / np.sqrt(np.prod(x1.shape))) + (
+                torch.linalg.norm(x2 - torch.mean(x2)) / np.sqrt(np.prod(x2.shape))
+            )
+
+            # Normalise
+            hidden_states = x1 + x2
+
+        return hidden_states, prenorm
+
+    def postprocess(self, latent_states, dequantised_states, x_shape):
+        batch_size, time = x_shape
+        dequantised_states = dequantised_states.view(batch_size, time, -1).permute(0, 2, 1).contiguous()
+        latent_states = latent_states.view(batch_size, time)
+        return latent_states, dequantised_states
+
+    def quantise(self, latent_states):
+        # Calculate latent code latent_states
+        codebook_weights = self.codebook.t()
+        distance = (
+            torch.sum(latent_states**2, dim=-1, keepdim=True)
+            - 2 * torch.matmul(latent_states, codebook_weights)
+            + torch.sum(codebook_weights**2, dim=0, keepdim=True)
+        )  # (batch_size * latent_states , codebook_weights)
+        min_distance, music_tokens = torch.min(distance, dim=-1)
+        fit = torch.mean(min_distance)
+        return music_tokens, fit
+
+    def dequantise(self, music_tokens):
+        dequantised_states = F.embedding(music_tokens, self.codebook)
+        return dequantised_states
+
+    def encode(self, latent_states):
+        samples, _, seq_len = latent_states.shape
+
+        # Preprocess.
+        latent_states, _ = self.preprocess(latent_states)
+
+        # Quantise
+        music_tokens, _ = self.quantise(latent_states)
+
+        # Postprocess.
+        music_tokens = music_tokens.view(samples, seq_len)
+        return music_tokens
+
+    def decode(self, music_tokens):
+        samples, seq_len = music_tokens.shape
+
+        # Dequantise
+        dequantised_states = self.dequantise(music_tokens)
+
+        # Postprocess
+        dequantised_states = (
+            dequantised_states.view(samples, seq_len, self.codebook_width).permute(0, 2, 1).contiguous()
+        )
+        return dequantised_states
+
+    def forward(self, hidden_states, update_codebook=True):
+        samples, _, seq_len = hidden_states.shape
+
+        # Preprocess
+        hidden_states, prenorm = self.preprocess(hidden_states)
+
+        # Init codebook if not inited
+        if update_codebook and not self.init:
+            self.init_codebook(hidden_states)
+
+        # Quantise and dequantise through bottleneck
+        music_tokens, fit = self.quantise(hidden_states)
+        dequantised_states = self.dequantise(music_tokens)
+
+        # Update embeddings
+        if update_codebook:
+            update_metrics = self.update_codebook(hidden_states, music_tokens)
+        else:
+            update_metrics = {}
+
+        # Loss
+        commit_loss = torch.linalg.norm(dequantised_states.detach() - hidden_states) ** 2 / np.prod(
+            hidden_states.shape
+        )
+
+        # Passthrough
+        dequantised_states = hidden_states + (dequantised_states - hidden_states).detach()
+
+        # Postprocess
+        music_tokens, dequantised_states = self.postprocess(music_tokens, dequantised_states, (samples, seq_len))
+        return music_tokens, dequantised_states, commit_loss, dict(fit=fit, pn=prenorm, **update_metrics)
+
+
+class JukeboxBottleneck(nn.Module):
+    def __init__(self, config, levels):
+        super().__init__()
+        self.levels = levels
+        self.level_blocks = nn.ModuleList()
+        for level in range(self.levels):
+            self.level_blocks.append(JukeboxBottleneckBlock(config))
+
+    def encode(self, raw_audio):
+        music_tokens = [
+            level_block.encode(hidden_states) for (level_block, hidden_states) in zip(self.level_blocks, raw_audio)
+        ]
+        return music_tokens
+
+    def decode(self, music_tokens, start_level=0, end_level=None):
+        if end_level is None:
+            end_level = self.levels
+        quantised_audio = [
+            level_block.decode(z) for (level_block, z) in zip(self.level_blocks[start_level:end_level], music_tokens)
+        ]
+        return quantised_audio
+
+    def forward(self, input_audio):
+        music_tokens, quantised_states, commit_losses, metrics = [], [], [], []
+        for level in range(self.levels):
+            level_block = self.level_blocks[-level - 1]
+            hidden_states = input_audio[level]
+            sampled_tokens, quantised_state, commit_loss, metric = level_block(
+                hidden_states, update_codebook=self.training
+            )
+            music_tokens.append(sampled_tokens)
+            if not self.training:
+                # Be extra paranoid and make sure the encoder weights can't
+                # change from straight-through estimator
+                quantised_state = quantised_state.detach()
+            quantised_states.append(quantised_state)
+            commit_losses.append(commit_loss)
+            if self.training:
+                metrics.append(metric)
+        return music_tokens, quantised_states, commit_losses, metrics
+
+
+JUKEBOX_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config (`JukeboxConfig`): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    """The Hierarchical VQ-VAE model used in Jukebox. This model follows the Hierarchical VQVAE paper from [Will Williams, Sam
+Ringer, Tom Ash, John Hughes, David MacLeod, Jamie Dougherty](https://huggingface.co/papers/2002.08111).
+
+    """,
+    JUKEBOX_START_DOCSTRING,
+)
+class JukeboxVQVAE(PreTrainedModel):
+    config: JukeboxVQVAEConfig
+    base_model_prefix = "vqvae"
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Embedding):  # embed_tokens
+            module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale)
+        elif isinstance(module, JukeboxConv1D):
+            if self.config.zero_out:
+                module.weight.data.zero_()
+            else:
+                module.weight.data.normal_(mean=0.0, std=0.02 * self.config.init_scale)
+        elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
+            module.conv1d_2.weight.data.zero_()
+            module.conv1d_2.bias.data.zero_()
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def __init__(self, config: JukeboxVQVAEConfig):
+        super().__init__(config)
+        downs_t = config.res_downs_t
+        strides_t = config.res_strides_t
+        if not config.sample_length:
+            downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]
+            top_raw_to_tokens = np.prod(downsamples)
+            config.sample_length = (
+                config.sample_length_in_seconds * config.sampling_rate // top_raw_to_tokens
+            ) * top_raw_to_tokens
+            config.sample_length = config.sample_length.astype(int)
+
+        self.nb_discrete_codes = config.nb_discrete_codes
+        self.commit = config.commit
+        self.sample_length = config.sample_length
+
+        self.downsamples = [stride**down for stride, down in zip(strides_t, downs_t)]
+        self.hop_lengths = np.cumprod(self.downsamples)
+        self.levels = levels = config.levels
+        self.music_tokens_shapes = [
+            (int(self.sample_length // self.hop_lengths[-level - 1])) for level in range(levels)
+        ]
+
+        self.multipliers = config.multipliers if config.multipliers is not None else [1] * levels
+
+        self.encoders = nn.ModuleList()
+        self.decoders = nn.ModuleList()
+        for level in range(levels):
+            width = config.res_conv_width * self.multipliers[level]
+            depth = config.res_conv_depth * self.multipliers[level]
+            self.encoders.append(
+                JukeboxEncoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1])
+            )
+            self.decoders.append(
+                JukeboxDecoder(config, width, depth, level + 1, downs_t[: level + 1], strides_t[: level + 1])
+            )
+
+        self.bottleneck = JukeboxBottleneck(config, levels)
+
+    def _decode(self, music_tokens, start_level=0, end_level=None):
+        # Decode
+        if end_level is None:
+            end_level = self.levels
+        latent_states = self.bottleneck.decode(music_tokens, start_level=start_level, end_level=end_level)
+        # Use only lowest level
+        decoder, dequantised_state = self.decoders[start_level], latent_states[0:1]
+        dequantised_state = decoder(dequantised_state, all_levels=False)
+        dequantised_state = dequantised_state.permute(0, 2, 1)
+        return dequantised_state
+
+    def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1) -> torch.Tensor:
+        """
+        Transforms the input `music_tokens` to their `raw_audio` representation.
+
+        Args:
+            music_tokens (`torch.LongTensor`):
+                Tensor of music tokens which will be decoded to raw audio by using the codebook. Each music token
+                should be an index to a corresponding `code` vector in the codebook.
+            start_level (`int`, *optional*):
+                Level at which the decoding process will start. Default to 0.
+            end_level (`int`, *optional*):
+                Level at which the decoding process will start. Default to None.
+            bs_chunks (int, *optional*):
+                Number of chunks to process at the same time.
+        """
+        token_chunks = [torch.chunk(token, bs_chunks, dim=0) for token in music_tokens]
+        dequantised_states = []
+        for i in range(bs_chunks):
+            music_tokens_i = [chunks[i] for chunks in token_chunks]
+            dequantised_state = self._decode(music_tokens_i, start_level=start_level, end_level=end_level)
+            dequantised_states.append(dequantised_state)
+        return torch.cat(dequantised_states, dim=0)
+
+    def _encode(self, raw_audio, start_level=0, end_level=None):
+        # Encode
+        if end_level is None:
+            end_level = self.levels
+        input_audio = raw_audio.permute(0, 2, 1).float()
+        latent_states = []
+        for level in range(self.levels):
+            encoder = self.encoders[level]
+            latent_state = encoder(input_audio)
+            latent_states.append(latent_state[-1])
+        music_tokens = self.bottleneck.encode(latent_states)
+        return music_tokens[start_level:end_level]
+
+    def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
+        """
+        Transforms the `input_audio` to a discrete representation made out of `music_tokens`.
+
+        Args:
+            input_audio (`torch.Tensor`):
+                Raw audio which will be encoded to its discrete representation using the codebook. The closest `code`
+                form the codebook will be computed for each sequence of samples.
+            start_level (`int`, *optional*, defaults to 0):
+                Level at which the encoding process will start. Default to 0.
+            end_level (`int`, *optional*):
+                Level at which the encoding process will start. Default to None.
+            bs_chunks (int, *optional*, defaults to 1):
+                Number of chunks of raw audio to process at the same time.
+        """
+        audio_chunks = torch.chunk(input_audio, bs_chunks, dim=0)
+        music_tokens_list = []
+        for chunk_i in audio_chunks:
+            music_tokens_i = self._encode(chunk_i, start_level=start_level, end_level=end_level)
+            music_tokens_list.append(music_tokens_i)
+        music_tokens = [torch.cat(music_tokens_level, dim=0) for music_tokens_level in zip(*music_tokens_list)]
+        return music_tokens
+
+    def sample(self, n_samples):
+        music_tokens = [
+            torch.randint(0, self.nb_discrete_codes, size=(n_samples, *music_tokens_shape), device="cpu")
+            for music_tokens_shape in self.music_tokens_shapes
+        ]
+        return self.decode(music_tokens)
+
+    def forward(self, raw_audio: torch.FloatTensor) -> tuple[torch.Tensor, torch.Tensor]:
+        """
+        Forward pass of the VQ-VAE, encodes the `raw_audio` to latent states, which are then decoded for each level.
+        The commit loss, which ensure that the encoder's computed embeddings are close to the codebook vectors, is
+        computed.
+
+        Args:
+            raw_audio (`torch.FloatTensor`):
+                Audio input which will be encoded and decoded.
+
+        Returns:
+            `tuple[torch.Tensor, torch.Tensor]`
+
+
+        Example:
+        ```python
+        >>> from transformers import JukeboxVQVAE, set_seed
+        >>> import torch
+
+        >>> model = JukeboxVQVAE.from_pretrained("openai/jukebox-1b-lyrics").eval()
+        >>> set_seed(0)
+        >>> zs = [torch.randint(100, (4, 1))]
+        >>> model.decode(zs).shape
+        torch.Size([4, 8, 1])
+        ```
+        """
+
+        # Encode/Decode
+        input_audio = raw_audio.permute(0, 2, 1).float()
+        latent_states = []
+        for level in range(self.levels):
+            encoder = self.encoders[level]
+            latent_state = encoder(input_audio)
+            latent_states.append(latent_state[-1])
+
+        _, music_tokens, commit_losses, _ = self.bottleneck(latent_states)
+        dequantised_states = []
+        for level in range(self.levels):
+            decoder = self.decoders[level]
+            dequantised_state = decoder(music_tokens[level : level + 1], all_levels=False)
+            dequantised_states.append(dequantised_state.permute(0, 2, 1))
+
+        commit_loss = sum(commit_losses)
+        loss = self.commit * commit_loss
+
+        return dequantised_states, loss
+
+
+class JukeboxMLP(nn.Module):
+    def __init__(self, config):
+        # a single channel is always used in original code
+        super().__init__()
+        embed_dim = config.hidden_size
+        hidden_dim = int(config.mlp_multiplier * embed_dim)
+
+        self.c_fc = JukeboxConv1D(embed_dim, hidden_dim)
+        self.c_proj = JukeboxConv1D(hidden_dim, embed_dim)
+        self.act = ACT2FN[config.act_fn]
+        self.dropout = nn.Dropout(config.resid_dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class JukeboxLayerNorm(FusedLayerNorm):
+    def __init__(self, normalized_shape, eps=1e-5, elementwise_affine=True):
+        super().__init__(normalized_shape, eps=eps, elementwise_affine=elementwise_affine)
+        self.width = np.prod(normalized_shape)
+        self.max_numel = 65535 * self.width
+
+    def forward(self, input):
+        if input.numel() > self.max_numel:
+            return F.layer_norm(input, self.normalized_shape, self.weight, self.bias, self.eps).type_as(input)
+        else:
+            return super().forward(input).type_as(input)
+
+
+class JukeboxAttention(nn.Module):
+    def __init__(self, config, n_ctx, attn_func="dense_attn"):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.n_heads = config.n_heads
+        self.dropout = config.attn_dropout
+        hidden_dim = int(config.attention_multiplier * self.embed_dim)
+
+        self.head_dim = hidden_dim // config.n_heads
+        self.n_ctx = n_ctx
+        self.hidden_dim = hidden_dim
+        self.scale = self.head_dim**-0.25
+        self.mask = config.mask
+
+        if attn_func == "cross_attention":
+            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim)
+            self.c_enc_kv = JukeboxConv1D(self.embed_dim, hidden_dim * 2)
+        else:
+            self.c_attn = JukeboxConv1D(self.embed_dim, hidden_dim * 3)
+
+        self.c_proj = JukeboxConv1D(hidden_dim, self.embed_dim)
+        self.attn_dropout = nn.Dropout(config.attn_dropout)
+        self.resid_dropout = nn.Dropout(config.resid_dropout)
+
+        # Sequence of length seq_len is factored as [blocks, seq_len // blocks]
+        self.attn_func = attn_func
+        if attn_func == "cross_attention":
+            self.qkv = self.decode_qkv
+        elif attn_func == "prime_attn":
+            self.qkv = self.prime_qkv
+        else:
+            self.qkv = self.factored_qkv
+
+        ATTENTION_MAP = {
+            "dense_attn": (self.dense_attn, "autoregressive"),
+            "block_attn": (self.block_attn, "autoregressive"),
+            "transpose_block_attn": (self.transpose_block_attn, "autoregressive"),
+            "prev_block_attn": (self.prev_block_attn, None),
+            "summary_attn": (self.summary_attn, "summary"),
+            "summary_spread_attn": (self.summary_spread_attn, "summary"),
+            "cross_attention": (self.dense_attn, None),
+            "prime_attn": (self.prime_attn, "prime"),
+        }
+        self.attn, self.attn_mask = ATTENTION_MAP[attn_func]
+
+        self.blocks = config.blocks
+        self.spread = config.spread
+        if self.blocks is not None:
+            self.block_ctx = self.n_ctx // self.blocks
+
+        self.sample_t = 0
+        self.cache = {}
+        self.encoder_len = config.nb_relevant_lyric_tokens  # length of the encoder input ids
+        self.record_attn = False
+
+    def _attn(self, query_states, key_states, value_states, sample):
+        scale = self.scale
+        if self.training:
+            attention_weight = torch.matmul(query_states * scale, key_states * scale)
+        else:
+            attention_weight = torch.matmul(query_states, key_states)
+            attention_weight.mul_(scale * scale)
+        attn_weight_type = attention_weight.dtype
+        attention_weight = attention_weight.float()
+        if self.mask:
+            # Generate appropriate mask to mask out all positions before current
+            # Might take up lot of memory for dense, so can cache it
+            mask = get_mask(
+                self.attn_mask,
+                query_states.size(-2),
+                key_states.size(-1),
+                self.blocks,
+                self.spread,
+                attention_weight.device,
+                sample,
+                self.sample_t,
+            )
+            if mask is not None:
+                attention_weight = attention_weight * mask + -1e9 * (1 - mask)
+        attention_prob = F.softmax(attention_weight, dim=-1).type(attn_weight_type)
+        if self.record_attn:
+            self.attention_prob = attention_prob
+            if self.attn_func == "prime_attn":
+                # only keep music queries and lyrics keys/values
+                self.attention_prob = self.attention_prob[:, :, self.encoder_len :, : self.encoder_len]
+        attention_prob = self.attn_dropout(attention_prob)
+        context_states = torch.matmul(attention_prob, value_states)
+        return context_states
+
+    def merge_heads(self, hidden_states):
+        hidden_states = hidden_states.permute(0, 2, 1, 3).contiguous()
+        new_hidden_states_shape = (*hidden_states.size()[:-2], hidden_states.size(-2) * hidden_states.size(-1))
+        return hidden_states.view(*new_hidden_states_shape)  # in Tensorflow implem: fct merge_states
+
+    def split_heads(self, hidden_states, is_key=False):
+        new_hidden_states_shape = (
+            *hidden_states.size()[:-1],
+            self.n_heads,
+            hidden_states.size(-1) // self.n_heads,
+        )
+        hidden_states = hidden_states.view(*new_hidden_states_shape)  # in Tensorflow implem: fct split_states
+        if is_key:
+            return hidden_states.permute(0, 2, 3, 1)
+        else:
+            return hidden_states.permute(0, 2, 1, 3)
+
+    def dense_attn(self, query, key, value, sample):
+        query = self.split_heads(query)
+        key = self.split_heads(key, is_key=True)
+        value = self.split_heads(value)
+        context_states = self._attn(query, key, value, sample)
+        context_states = self.merge_heads(context_states)
+        return context_states
+
+    def block_attn(self, query, key, value, sample):
+        block_ctx = self.block_ctx
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
+        if sample:
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+        else:
+            query_length = query.shape[1]
+            query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
+            if query_length < seq_len:
+                seq_len = query_length
+                key = key[:, -seq_len:].contiguous()
+                value = value[:, -seq_len:].contiguous()
+            key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+            value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+    def transpose_block_attn(self, query, key, value, sample):
+        block_ctx = self.block_ctx
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
+        if sample:
+            block_len = (seq_len - 1) % block_ctx
+            key = key[:, block_len::block_ctx, :]
+            value = value[:, block_len::block_ctx, :]
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+        else:
+            query_length = query.shape[1]
+            query = query.view(batch_size, query_length // block_ctx, block_ctx, embed_dim)
+            query = query.transpose(1, 2).contiguous()
+            query = query.view(batch_size * block_ctx, query_length // block_ctx, embed_dim)
+
+            key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
+            key = key.transpose(1, 2).contiguous()
+            key = key.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
+
+            value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)
+            value = value.transpose(1, 2).contiguous()
+            value = value.view(batch_size * block_ctx, seq_len // block_ctx, embed_dim)
+
+            block_attn = self.dense_attn(query, key, value, sample)
+            block_attn = block_attn.view(batch_size, block_ctx, query_length // block_ctx, embed_dim)
+            block_attn = block_attn.transpose(1, 2).contiguous()
+            block_attn = block_attn.view(batch_size, query_length, embed_dim)
+
+            return block_attn
+
+    def prev_block_attn(self, query, key, value, sample):
+        block_ctx = self.block_ctx
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
+        if sample:
+            block = (seq_len - 1) // block_ctx
+            prev_l = (block - 1) * block_ctx
+            if block > 0:
+                key = key[:, prev_l : prev_l + block_ctx, :]
+                value = value[:, prev_l : prev_l + block_ctx, :]
+            else:
+                key = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype)
+                value = torch.zeros(batch_size, block_ctx, embed_dim, device=query.device, dtype=query.dtype)
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+        else:
+            query_length = query.shape[1]
+            query = query.view(batch_size * query_length // block_ctx, block_ctx, embed_dim)
+
+            key = key.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
+            key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0))
+            key = key.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+
+            value = value.view(batch_size, seq_len // block_ctx, block_ctx, embed_dim)[:, :-1, :, :]
+            value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0))
+            value = value.view(batch_size * seq_len // block_ctx, block_ctx, embed_dim)
+
+            if query_length < seq_len:
+                nb_query_blocks = query_length // block_ctx
+                nb_key_blocks = seq_len // block_ctx
+                seq_len = query_length
+                key = key.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:]
+                key = key.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim)
+
+                value = value.view(batch_size, nb_key_blocks, block_ctx, embed_dim)[:, -nb_query_blocks:]
+                value = value.contiguous().view(batch_size * nb_query_blocks, block_ctx, embed_dim)
+
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+    def summary_attn(self, query, key, value, sample):
+        blocks = self.blocks
+        block_ctx = self.block_ctx
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
+        if sample:
+            key = key[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :]
+            key = torch.nn.functional.pad(key, (0, 0, 1, 0))
+
+            value = value[:, block_ctx - 1 : blocks * block_ctx - 1 : block_ctx, :]
+            value = torch.nn.functional.pad(value, (0, 0, 1, 0))
+            return self.dense_attn(query, key, value, sample).view(batch_size, 1, embed_dim)
+        else:
+            key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :]
+            key = torch.nn.functional.pad(key, (0, 0, 1, 0))  # batch_size, blocks, embed_dim
+
+            value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -1, :]
+            value = torch.nn.functional.pad(value, (0, 0, 1, 0))  # batch_size, blocks, embed_dim
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+    def summary_spread_attn(self, query, key, value, sample):
+        blocks = self.blocks
+        spread = self.spread
+
+        batch_size, seq_len, embed_dim = value.shape  # For sample, query_len= 1, key_len = value_len = sample_t
+        if sample:
+            raise NotImplementedError
+        else:
+            key = key.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
+            key = torch.nn.functional.pad(key, (0, 0, 0, 0, 1, 0)).contiguous()
+            key = key.view(batch_size, blocks * spread, embed_dim)
+
+            value = value.view(batch_size, blocks, seq_len // blocks, embed_dim)[:, :-1, -spread:, :]
+            value = torch.nn.functional.pad(value, (0, 0, 0, 0, 1, 0)).contiguous()
+            value = value.view(batch_size, blocks * spread, embed_dim)
+
+            return self.dense_attn(query, key, value, sample).view(batch_size, seq_len, embed_dim)
+
+    def prime_attn(self, query, key, value, sample):
+        encoder_len = self._encoder_len
+        key = key[:, :encoder_len]
+        value = value[:, :encoder_len]
+        return self.dense_attn(query, key, value, sample)
+
+    def factored_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        if last_encoder_hidden_states is not None:
+            raise TypeError("last_encoder_hidden_states should be None")
+
+        query, key, value = hidden_states.chunk(3, dim=2)
+        if sample:
+            self.sample_t += curr_ctx
+            key, value = self._append_cache(key, value)
+            l_cache = self._suff_cache_len()
+            if self._cache_len() > l_cache:
+                self._slice_cache(-l_cache)
+            if curr_ctx > 1:
+                if self.attn_func != "dense_attn":
+                    query = self._pad_to_block_ctx(query, query=True)
+                    key = self._pad_to_block_ctx(key)
+                    value = self._pad_to_block_ctx(value)
+                sample = False
+            else:
+                key = self.cache["key"]
+                value = self.cache["value"]
+        return query, key, value, sample
+
+    def prime_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        if last_encoder_hidden_states is not None:
+            raise TypeError("last_encoder_hidden_states should be None")
+        query, key, value = hidden_states.chunk(3, dim=2)
+        if sample:
+            if self._cache_len() < self._encoder_len:
+                self._append_cache(key, value)
+            if self._cache_len() > self._encoder_len:
+                self._slice_cache(0, self._encoder_len)
+            key, value = self.cache["key"], self.cache["value"]
+            self.sample_t += curr_ctx
+        return query, key, value, sample
+
+    def decode_qkv(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        query = hidden_states
+        if sample:
+            if self.sample_t == 0:
+                self.cache["key"], self.cache["value"] = self.c_enc_kv(
+                    last_encoder_hidden_states.type_as(hidden_states)
+                ).chunk(2, dim=2)
+            key, value = self.cache["key"], self.cache["value"]
+            self.sample_t += curr_ctx
+        else:
+            key, value = self.c_enc_kv(last_encoder_hidden_states.type_as(hidden_states)).chunk(2, dim=2)
+        return query, key, value, sample
+
+    def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+        curr_ctx = hidden_states.shape[1]
+        hidden_states = self.c_attn(hidden_states)
+        query, key, value, sample = self.qkv(
+            hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample
+        )
+        attention_scores = self.attn(query, key, value, sample)
+        if attention_scores.shape[1] != curr_ctx:
+            offset = self._offset(curr_ctx)
+            attention_scores = attention_scores[:, offset : offset + curr_ctx, :].contiguous()
+        attention_scores = self.c_proj(attention_scores)
+        return self.resid_dropout(attention_scores)
+
+    @property
+    def _encoder_len(self):
+        encoder_len = self.encoder_len
+        encoder_blocks = (encoder_len // self.blocks) + 1
+        return encoder_blocks * self.blocks
+
+    def _offset(self, curr_ctx):
+        if self.attn_func == "dense_attn":
+            return 0
+        return (self.sample_t - curr_ctx) % self.block_ctx
+
+    def _pad_to_block_ctx(self, hidden_states, query=False):
+        seq_len = hidden_states.shape[1]
+        offset = self._offset(seq_len) if query else 0
+        n_blocks = (seq_len + offset + self.block_ctx - 1) // self.block_ctx
+        pad = n_blocks * self.block_ctx - seq_len - offset
+        if pad == 0 and offset == 0:
+            return hidden_states
+        else:
+            return F.pad(hidden_states, (0, 0, offset, pad))
+
+    def _cache_len(self):
+        return 0 if "key" not in self.cache else self.cache["key"].shape[1]
+
+    def _suff_cache_len(self):
+        """
+        Precondition:
+            key and value are appended with the current context and self.sample_t reflects the 1-indexed sample
+            location in the context.
+        """
+        previous_block_length = (self.sample_t - 1) % self.block_ctx + 1 + self.block_ctx
+        REQUIRED_CACHE_LEN = {
+            "dense_attn": self.sample_t,
+            "block_attn": (self.sample_t - 1) % self.block_ctx + 1,
+            "transpose_block_attn": self.sample_t,
+            "prev_block_attn": self.sample_t if self.sample_t <= self.block_ctx else previous_block_length,
+            "cross_attn": self.encoder_len,
+            "prime_attn": min(self.sample_t, self._encoder_len),
+        }
+
+        return REQUIRED_CACHE_LEN[self.attn_func]
+
+    def _slice_cache(self, start, end=None):
+        self.cache["key"] = self.cache["key"][:, start:end]
+        self.cache["value"] = self.cache["value"][:, start:end]
+
+    def _append_cache(self, key, value):
+        if "key" not in self.cache:
+            self.cache["key"] = key
+            self.cache["value"] = value
+        else:
+            old_key, old_value = key, value
+            key = torch.cat([self.cache["key"], old_key], dim=1)
+            value = torch.cat([self.cache["value"], old_value], dim=1)
+            del self.cache["key"]
+            del self.cache["value"]
+            del old_key
+            del old_value
+            self.cache["key"] = key
+            self.cache["value"] = value
+        return self.cache["key"], self.cache["value"]
+
+    def del_cache(self):
+        self.sample_t = 0
+        if "key" in self.cache:
+            del self.cache["key"]
+        if "value" in self.cache:
+            del self.cache["value"]
+        self.cache = {}
+
+
+class JukeboxBlock(nn.Module):
+    def __init__(self, config, n_ctx, attn_func="dense_attn"):
+        super().__init__()
+        self.width = config.hidden_size
+        self.attn = JukeboxAttention(config, n_ctx, attn_func=attn_func)
+
+        self.layer_norm_0 = JukeboxLayerNorm(config.hidden_size)
+        self.mlp = JukeboxMLP(config)
+        self.layer_norm_1 = JukeboxLayerNorm(config.hidden_size)
+        self.res_scale = 1.0 / config.num_layers if config.attn_res_scale else 1.0
+        self.attn_func = attn_func
+
+    def forward(self, hidden_states, last_encoder_hidden_states, sample=False):
+        residuals = hidden_states
+        hidden_states = self.layer_norm_0(hidden_states)
+        hidden_states = self.attn(hidden_states, last_encoder_hidden_states, sample)
+
+        output_states = self.layer_norm_1(residuals + hidden_states)
+        output_states = self.mlp(output_states)
+        if self.res_scale == 1.0:
+            output = residuals + hidden_states + output_states
+        else:
+            output = residuals + self.res_scale * (hidden_states + output_states)
+        return output
+
+
+class JukeboxLayerStack(nn.Module):
+    def __init__(self, config, n_ctx):
+        super().__init__()
+        self.n_ctx = n_ctx
+        self.width = config.hidden_size
+        self.num_layers = config.num_layers
+        self.blocks = config.blocks
+        self.attention_pattern = config.attention_pattern
+        if self.blocks is not None:
+            self.block_ctx = n_ctx // self.blocks
+        self.encoder_len = config.nb_relevant_lyric_tokens
+        self.n_heads = config.n_heads
+
+        # Orders of attn_func
+        attention_pattern = ATTENTION_PATTERNS[self.attention_pattern]
+        self._attn_mods = nn.ModuleList()
+        for depth in range(self.num_layers):
+            self._attn_mods.append(JukeboxBlock(config, n_ctx, attn_func=attention_pattern(depth)))
+
+        self.saved_attn_weights = []
+
+    def set_record_attn(self, record_attn):
+        """
+        Makes forward prop dump self-attention softmaxes to self.saved_attn_weights.
+
+        Args:
+            record_attn (`Union[bool,set]`):
+                Either a set of layer indices indicating which layers to store, or a boolean value indicating Whether
+                to dump all.
+        """
+
+        def _should_record_attn(layer_idx):
+            if isinstance(record_attn, bool):
+                return record_attn
+            return layer_idx in record_attn
+
+        for i, layer in enumerate(self._attn_mods):
+            layer.attn.record_attn = _should_record_attn(i)
+
+        if not record_attn:
+            self.saved_attn_weights = []
+
+    def forward(self, hidden_states, last_encoder_hidden_states=None, sample=False):
+        # Blocks
+        for i, attn_layer in enumerate(self._attn_mods):
+            if attn_layer.attn_func == "cross_attention":  # attend to the lyrics
+                hidden_states = attn_layer(
+                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=sample
+                )
+            else:
+                hidden_states = attn_layer(hidden_states, last_encoder_hidden_states=None, sample=sample)
+            if attn_layer.attn.record_attn:
+                self.saved_attn_weights.append(attn_layer.attn.c_attn.weight)
+        return hidden_states
+
+    def del_cache(self):
+        for attn_layer in self._attn_mods:
+            attn_layer.attn.del_cache()
+
+
+class JukeboxPositionalEmbedding(nn.Module):
+    def __init__(self, embed_dim, width):
+        super().__init__()
+        self.pos_emb = nn.Parameter(torch.empty((embed_dim, width)))
+
+    def forward(self):
+        pos_emb = self.pos_emb
+        return pos_emb
+
+
+class JukeboxConditionalAutoregressive(nn.Module):
+    def __init__(
+        self,
+        config,
+        n_ctx=None,
+        embed_dim=None,
+        audio_conditioning=False,
+        metadata_conditioning=False,
+        is_encoder=False,
+    ):
+        """
+        Autoregressive model on either lyric tokens or music tokens, or both. The attention pattern should be properly
+        set for each configuration.
+
+        Args:
+            config (`JukeboxPriorConfig`):
+                Model configuration class with all the parameters of the model. Initializing with a config file does
+                not load the weights associated with the model, only the configuration. Check out the
+                [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+            n_ctx (`int`, *optional*):
+                Number of tokens or lyrics tokens provided in a single pass.
+            embed_dim (`int`, *optional*):
+                Either equals to the dimension of the codebook, or the sum of n_vocab (lyrics) and codebook dimension,
+                if the model combines lyrics and music tokens, or simply n_vocab if the model is a separate encoder
+            audio_conditioning (`bool`, *optional*, defaults to `False`):
+                Whether or not the prior supports conditioning on audio.
+            metadata_conditioning (`bool`, *optional*, defaults to `False`):
+                Whether or not the prior supports conditioning on artitst, genres, lyrics and timing.
+            is_encoder (`bool`, *optional*, defaults to `False`):
+                Whether the model is an encoder only model.
+        """
+
+        super().__init__()
+        self.width = config.hidden_size
+        self.num_layers = config.num_layers
+        self.n_ctx = n_ctx if n_ctx is not None else config.n_ctx
+        self.embed_dim = embed_dim if embed_dim is not None else config.music_vocab_size
+        self.embed_tokens = nn.Embedding(self.embed_dim, config.hidden_size)
+        self.embed_tokens_dropout = nn.Dropout(config.emb_dropout)
+        self.metadata_conditioning = metadata_conditioning
+        self.audio_conditioning = audio_conditioning
+        if not metadata_conditioning:
+            self.start_token = nn.Parameter(torch.empty((1, config.hidden_size)))
+        self.pos_emb = JukeboxPositionalEmbedding(self.n_ctx, config.hidden_size)
+        self.pos_emb_dropout = nn.Dropout(config.emb_dropout)
+
+        self.transformer = JukeboxLayerStack(config, n_ctx=self.n_ctx)
+        self.is_encoder = is_encoder
+        self.encoder_len = config.nb_relevant_lyric_tokens
+
+        if config.merged_decoder:
+            # Merged piped model uses this setup
+            self.add_cond_after_transformer = False
+            self.share_embed_tokens_fc_proj_out = False
+        else:
+            self.add_cond_after_transformer = True
+            self.share_embed_tokens_fc_proj_out = True
+
+        if not is_encoder:
+            self.fc_proj_out = nn.Linear(config.hidden_size, self.embed_dim, bias=False)
+            if self.share_embed_tokens_fc_proj_out:
+                self.fc_proj_out.weight = self.embed_tokens.weight
+            self.loss = torch.nn.CrossEntropyLoss()
+
+    def forward(
+        self,
+        tokens,
+        audio_conditioning=None,
+        metadata_conditioning=None,
+        last_encoder_hidden_states=None,
+        get_preds=False,
+        get_acts=False,
+        get_sep_loss=False,
+    ):
+        """
+        Args:
+            tokens (`torch.tensor`):
+                Can represent music tokens, lyrics tokens or both, depending on the configuration.
+        """
+        # Preprocess.
+        batch_size = tokens.shape[0]
+        with torch.no_grad():
+            tokens = tokens.view(batch_size, -1).long()
+
+        if not self.audio_conditioning:
+            audio_conditioning = torch.zeros(
+                (batch_size, 1, self.width),
+                device=tokens.device,
+                dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype,
+            )
+
+        target = tokens  # Target
+        hidden_states = self.embed_tokens(tokens)
+        # Shift by 1, and fill in start token
+        hidden_states = torch.cat((hidden_states[:, -1:], hidden_states[:, :-1]), dim=1)
+        if self.metadata_conditioning:
+            hidden_states[:, 0] = metadata_conditioning.view(batch_size, self.width)
+        else:
+            hidden_states[:, 0] = self.start_token
+
+        hidden_states = (
+            self.embed_tokens_dropout(hidden_states) + self.pos_emb_dropout(self.pos_emb()) + audio_conditioning
+        )  # Pos emb and dropout
+
+        hidden_states = self.transformer(
+            hidden_states, last_encoder_hidden_states=last_encoder_hidden_states
+        )  # Transformer
+        if self.add_cond_after_transformer:  # Piped doesn't add x_cond
+            hidden_states = hidden_states + audio_conditioning
+
+        activations = hidden_states
+        if self.is_encoder:
+            return hidden_states
+
+        hidden_states = self.fc_proj_out(hidden_states)  # Predictions
+        loss_fn = nn.CrossEntropyLoss()
+        if get_sep_loss:
+            lyric_hidden_states = hidden_states[:, : self.encoder_len].reshape(-1, self.embed_dim)
+            token_hidden_states = hidden_states[:, self.encoder_len :].reshape(-1, self.embed_dim)
+
+            lyric_loss = loss_fn(lyric_hidden_states, target[:, : self.encoder_len].reshape(-1)) / np.log(2.0)
+            music_token_loss = loss_fn(token_hidden_states, target[:, self.encoder_len :].reshape(-1)) / np.log(2.0)
+
+            loss = (lyric_loss, music_token_loss)  # Note order! Lyric is first
+        else:
+            loss = loss_fn(hidden_states.view(-1, self.embed_dim), target.view(-1)) / np.log(2.0)  # Loss
+
+        if get_preds:
+            return loss, hidden_states
+        elif get_acts:
+            return loss, activations
+        else:
+            return loss, None
+
+    def get_emb(self, sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning):
+        if sample_t == 0:
+            hidden_states = torch.empty(n_samples, 1, self.width, dtype=self.embed_tokens.weight.dtype).to(
+                self.embed_tokens.weight.device
+            )
+            if self.metadata_conditioning:
+                hidden_states[:, 0] = metadata_conditioning.view(n_samples, self.width)
+            else:
+                hidden_states[:, 0] = self.start_token
+        else:
+            hidden_states = self.embed_tokens(tokens)
+        if audio_conditioning.shape == (n_samples, self.n_ctx, self.width):
+            cond = audio_conditioning[:, sample_t : sample_t + 1, :]
+        else:
+            cond = audio_conditioning
+        # Pos emb, dropout is identity at eval time
+        hidden_states = hidden_states + self.pos_emb()[sample_t : sample_t + 1] + cond
+        return hidden_states, cond
+
+    def sample(
+        self,
+        n_samples,
+        audio_conditioning=None,
+        metadata_conditioning=None,
+        last_encoder_hidden_states=None,
+        temp=1.0,
+        top_k=0,
+        top_p=0.0,
+        get_preds=False,
+        sample_tokens=None,
+    ):
+        if sample_tokens is None:
+            sample_tokens = self.n_ctx
+
+        if not self.audio_conditioning:
+            audio_conditioning = torch.zeros(
+                (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
+            ).to(self.fc_proj_out.device)
+
+        with torch.no_grad():
+            sampled_tokens = []
+            tokens = None
+            if get_preds:
+                preds = []
+
+            iter = tqdm(range(0, sample_tokens), leave=False)
+            for sample_t in iter:
+                iter.set_description(f"Ancestral sampling {sample_tokens} music tokens", refresh=True)
+                hidden_states, cond = self.get_emb(
+                    sample_t, n_samples, tokens, audio_conditioning, metadata_conditioning
+                )
+
+                hidden_states = self.transformer(
+                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
+                )
+                if self.add_cond_after_transformer:
+                    hidden_states = hidden_states + cond
+                hidden_states = self.fc_proj_out(hidden_states)  # Predictions
+                if get_preds:
+                    preds.append(hidden_states.clone())
+                # Adjust logits
+                hidden_states = hidden_states / temp
+                hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
+                # Sample and replace hidden_states
+                tokens = torch.distributions.Categorical(logits=hidden_states).sample()
+                sampled_tokens.append(tokens.clone())
+
+            del tokens
+            self.transformer.del_cache()
+
+            tokens = torch.cat(sampled_tokens, dim=1)
+            if get_preds:
+                preds = torch.cat(preds, dim=1)
+        if get_preds:
+            return tokens, preds
+        else:
+            return tokens
+
+    def split_chunks(self, length, chunk_size):
+        n_passes = (length + chunk_size - 1) // chunk_size
+        chunk_sizes = [*[chunk_size] * (n_passes - 1), (length - 1) % chunk_size + 1]
+        return chunk_sizes
+
+    def primed_sample(
+        self,
+        n_samples,
+        lyric_and_music_tokens,
+        audio_conditioning=None,
+        metadata_conditioning=None,
+        last_encoder_hidden_states=None,
+        temp=1.0,
+        top_k=0,
+        top_p=0.0,
+        get_preds=False,
+        chunk_size=None,
+        sample_tokens=None,
+    ):
+        if sample_tokens is None:
+            sample_tokens = self.n_ctx
+        # Preprocess.
+        batch_size = lyric_and_music_tokens.shape[0]
+        with torch.no_grad():
+            lyric_and_music_tokens = lyric_and_music_tokens.view(batch_size, -1).long()
+
+        sampled_audio = torch.split(lyric_and_music_tokens, 1, dim=1)
+        sampled_audio = list(sampled_audio)
+
+        if not self.audio_conditioning:
+            audio_conditioning = torch.zeros(
+                (n_samples, 1, self.width), dtype=self.transformer._attn_mods[0].mlp.c_fc.weight.dtype
+            ).to(lyric_and_music_tokens.device)
+
+        with torch.no_grad():
+            if get_preds:
+                preds = []
+
+            # Fill up key/value cache for past context by running forward pass.
+            # We do so in chunks instead of doing the whole past in one forward pass to reduce max memory usage.
+            if chunk_size is None:
+                chunk_size = len(sampled_audio)
+            chunk_sizes = self.split_chunks(len(sampled_audio), chunk_size)
+            x_primes = []
+            start = 0
+            token = None
+
+            for current_chunk_size in tqdm(chunk_sizes, desc="Preparing past key value", leave=False):
+                sampled_audio_prime, conds_prime = [], []
+                for sample_t in range(start, start + current_chunk_size):
+                    x_prime, cond_prime = self.get_emb(
+                        sample_t, n_samples, token, audio_conditioning, metadata_conditioning
+                    )
+                    token = sampled_audio[sample_t]
+                    sampled_audio_prime.append(x_prime)
+                    conds_prime.append(cond_prime)
+                start = start + current_chunk_size
+                x_prime, cond_prime = torch.cat(sampled_audio_prime, dim=1), torch.cat(conds_prime, dim=1)
+                del sampled_audio_prime
+                del conds_prime
+                if not get_preds:
+                    del cond_prime
+                x_prime = self.transformer(x_prime, last_encoder_hidden_states=last_encoder_hidden_states, sample=True)
+
+                if get_preds:
+                    if self.add_cond_after_transformer:
+                        x_prime = x_prime + cond_prime
+                    del cond_prime
+                    x_primes.append(x_prime)
+                else:
+                    del x_prime
+
+            if get_preds:
+                x_prime = torch.cat(x_primes, dim=1)
+                x_prime = self.fc_proj_out(x_prime)  # Predictions
+                preds.append(x_prime)
+
+            # the input of the encoder and decoder can be merged into (lyrics, music tokens)
+            input_tokens = sampled_audio[-1]
+
+            itererator = tqdm(
+                range(len(sampled_audio), sample_tokens),
+                desc=f"Sampling {len(range(len(sampled_audio), sample_tokens))} music tokens",
+                leave=False,
+            )
+            for sample_t in itererator:
+                hidden_states, cond = self.get_emb(
+                    sample_t, n_samples, input_tokens, audio_conditioning, metadata_conditioning
+                )
+
+                hidden_states = self.transformer(
+                    hidden_states, last_encoder_hidden_states=last_encoder_hidden_states, sample=True
+                )
+                if self.add_cond_after_transformer:
+                    hidden_states = hidden_states + cond
+                hidden_states = self.fc_proj_out(hidden_states)  # Predictions
+                if get_preds:
+                    preds.append(hidden_states)
+                # Adjust logits
+                hidden_states = hidden_states / temp
+                hidden_states = filter_logits(hidden_states, top_k=top_k, top_p=top_p)
+                # only music tokens are sampled
+                music_tokens = torch.distributions.Categorical(logits=hidden_states).sample()
+                sampled_audio.append(music_tokens.clone())
+                input_tokens = music_tokens
+
+            del input_tokens, music_tokens
+            self.transformer.del_cache()
+
+            music_tokens = torch.cat(sampled_audio, dim=1)
+            if get_preds:
+                preds = torch.cat(preds, dim=1)
+        if get_preds:
+            return music_tokens, preds
+        else:
+            return music_tokens
+
+
+class JukeboxMusicTokenConditioner(nn.Module):
+    """
+    The `JukeboxMusicTokenConditioner` takes music tokens as an input (corresponding to the codes of the VQVAE's
+    codebook) and upsamples it using a single layer of decoder convolution block (the same is used in the VQVAE).
+    """
+
+    def __init__(self, config, level):
+        super().__init__()
+        self.embed_tokens = nn.Embedding(config.music_vocab_size, config.hidden_size)
+        config.embed_dim = config.music_vocab_size  # setting correct argument for the `JukeboxDecoder`
+
+        self.upsampler = JukeboxDecoderConvBock(
+            config,
+            config.hidden_size,
+            config.res_conv_width,
+            config.res_conv_depth,
+            config.res_downs_t[level],
+            config.res_strides_t[level],
+            reverse_dilation=False,
+        )
+        self.layer_norm = JukeboxLayerNorm(config.hidden_size)
+
+    def forward(self, music_tokens, raw_audio_conditioning=None):
+        """
+        Args:
+            music_tokens (`torch.LongTensor`):
+                Music tokens form the upper level in range(nb_discrete_codes)
+            raw_audio_conditioning (`torch.LongTensor`, *optional*):
+                Audio used when primed sampling, raw audio information that conditions the generation
+        """
+        if raw_audio_conditioning is None:
+            raw_audio_conditioning = 0.0
+        # Embed music_tokens
+        music_tokens = music_tokens.long()
+        hidden_states = self.embed_tokens(music_tokens)
+        hidden_states = hidden_states + raw_audio_conditioning
+
+        # Run conditioner
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states = self.upsampler(hidden_states)
+        hidden_states = hidden_states.permute(0, 2, 1)
+        hidden_states = self.layer_norm(hidden_states)
+        return hidden_states
+
+
+class JukeboxRangeEmbedding(nn.Module):
+    """
+    The `JukeboxRangeEmbedding` interpolate the given [pos_start, pos_end] to obtain an equivalent of time positional
+    embedding of length `n_ctx`.
+
+    Binning process : For each pos in position tensor, find its bin [start,end) mapped to [0,1,...,bins-1] [start,end)
+    -> [0,1) -> [0, bins) -> floor -> [0,...,bins-1] NOTE: Open ended interval on right, so start <= pos < end, not <=
+    end
+    """
+
+    def __init__(self, n_time, embed_dim, range, out_width, clamp=False):
+        super().__init__()
+        self.n_time = n_time
+        self.embed_dim = embed_dim
+        self.emb = nn.Embedding(embed_dim, out_width)
+        self.pos_min, self.pos_max = range
+        self.clamp = clamp
+
+    def forward(self, pos_start, pos_end=None):
+        # Check if [pos_start,pos_end] in [pos_min, pos_max)
+        if not len(pos_start.shape) == 2:
+            raise TypeError(f"Expected shape with 2 dims, got {pos_start.shape}")
+        if not (self.pos_min <= pos_start).all() and (pos_start < self.pos_max).all():
+            raise TypeError(f"Range is [{self.pos_min},{self.pos_max}), got {pos_start}")
+
+        pos_start = pos_start.float()
+        if pos_end is not None:
+            if self.clamp:
+                pos_end = pos_end.clamp(self.pos_min, self.pos_max)
+
+            pos_end = pos_end.float()
+        # Interpolate so that [pos_start, ..., pos_end] <-> position tensor of length n_ctx
+        n_time = self.n_time
+        if n_time != 1:
+            interpolation = (
+                torch.arange(0, n_time, dtype=torch.float, device=pos_start.device).view(1, n_time) / n_time
+            )
+            position = pos_start + (pos_end - pos_start) * interpolation
+        else:
+            position = pos_start
+
+        # Bin each value to bins_
+        # [0,1) -> [0,1..,embed_dim) -> [0,1...,embed_dim-1
+        normalised_position = (position - self.pos_min) / (self.pos_max - self.pos_min)
+        bins_ = (self.embed_dim * normalised_position).floor().long().detach()
+        return self.emb(bins_)
+
+
+class JukeboxLabelConditioner(nn.Module):
+    def __init__(self, config, include_time_signal):
+        super().__init__()
+
+        embed_dim = config.hidden_size
+        timing_dims = config.timing_dims
+        sampling_rate = config.sampling_rate
+        nb_genres, nb_artists = config.metadata_dims
+        music_tokens_shape = config.n_ctx
+
+        self.max_nb_genres = config.max_nb_genres
+        self.bow_genre_emb = nn.Embedding(nb_genres, embed_dim)
+        self.artist_emb = nn.Embedding(nb_artists, embed_dim)
+        self.include_time_signal = include_time_signal
+        if self.include_time_signal:
+            total_length_range = (config.min_duration * sampling_rate, config.max_duration * sampling_rate)
+            absolute_pos_range = (0.0, config.max_duration * sampling_rate)
+            relative_pos_range = (0.0, 1.0)
+            self.total_length_emb = JukeboxRangeEmbedding(1, timing_dims, total_length_range, embed_dim)
+            self.absolute_pos_emb = JukeboxRangeEmbedding(
+                music_tokens_shape, timing_dims, absolute_pos_range, embed_dim
+            )
+            self.relative_pos_emb = JukeboxRangeEmbedding(
+                music_tokens_shape, timing_dims, relative_pos_range, embed_dim, clamp=True
+            )
+
+    def forward(self, metadata):
+        total_length = metadata[:, 0:1]
+        offset = metadata[:, 1:2]
+        length = metadata[:, 2:3]
+        artist = metadata[:, 3:4]
+        genre = metadata[:, 4:]
+
+        # Start embedding of length 1
+        artist_emb = self.artist_emb(artist)
+        # Empty genre slots are denoted by -1. We mask these out.
+        mask = (genre >= 0).float().unsqueeze(2)
+        genre_emb = (self.bow_genre_emb(genre.clamp(0)) * mask).sum(dim=1, keepdim=True)
+        start_emb = genre_emb + artist_emb
+
+        # Pos embedding of length n_ctx
+        if self.include_time_signal:
+            start, end = offset, offset + length
+            total_length = total_length.float()
+            start = start.float()
+            end = end.float()
+            pos_emb = (
+                self.total_length_emb(total_length)
+                + self.absolute_pos_emb(start, end)
+                + self.relative_pos_emb(start / total_length, end / total_length)
+            )
+        else:
+            pos_emb = None
+        return start_emb, pos_emb
+
+
+class JukeboxPrior(PreTrainedModel):
+    """
+    The JukeboxPrior class, which is a wrapper around the various conditioning and the transformer. JukeboxPrior can be
+    seen as language models trained on music. They model the next `music token` prediction task. If a (lyric) `encoderù
+    is defined, it also models the `next character` prediction on the lyrics. Can be conditioned on timing, artist,
+    genre, lyrics and codes from lower-levels Priors.
+
+    Args:
+        config (`JukeboxPriorConfig`):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+        level (`int`, *optional*):
+            Current level of the Prior. Should be in range `[0,nb_priors]`.
+        nb_priors (`int`, *optional*, defaults to 3):
+            Total number of priors.
+        vqvae_encoder (`Callable`, *optional*):
+            Encoding method of the VQVAE encoder used in the forward pass of the model. Passing functions instead of
+            the vqvae module to avoid getting the parameters.
+        vqvae_decoder (`Callable`, *optional*):
+            Decoding method of the VQVAE decoder used in the forward pass of the model. Passing functions instead of
+            the vqvae module to avoid getting the parameters.
+    """
+
+    config: JukeboxPriorConfig
+
+    def _init_weights(self, module):
+        init_scale = self.config.init_scale
+
+        if isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, JukeboxConv1D):
+            if self.config.zero_out:
+                module.weight.data.zero_()
+            else:
+                module.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, JukeboxPositionalEmbedding):
+            module.pos_emb.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxRangeEmbedding):
+            module.emb.weight.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "lm_head"):
+            module.lm_head.weight.data.normal_(mean=0.0, std=0.02 * init_scale)
+        elif isinstance(module, JukeboxConditionalAutoregressive) and hasattr(module, "start_token"):
+            module.start_token.data.normal_(mean=0.0, std=0.01 * init_scale)
+        elif isinstance(module, JukeboxResConv1DBlock) and self.config.zero_out:
+            module.conv1d_2.weight.data.zero_()
+            module.conv1d_2.bias.data.zero_()
+        if isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        if isinstance(module, nn.Linear) and module.bias is not None:
+            module.bias.data.zero_()
+
+    def __init__(self, config: JukeboxPriorConfig, level=None, nb_priors=3, vqvae_encoder=None, vqvae_decoder=None):
+        super().__init__(config)
+        # Passing functions instead of the vqvae module to avoid getting params, only used in the
+        # forward loop
+        self.vqvae_encoder = vqvae_encoder
+        self.vqvae_decoder = vqvae_decoder
+
+        self.levels = nb_priors
+        self.level = level if level is not None else config.level
+
+        self.base_model_prefix = f"priors.{self.level}"
+
+        self.n_ctx = config.n_ctx
+
+        self.lyric_conditioning = config.nb_relevant_lyric_tokens > 0
+        self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens
+        self.encoder_loss_fraction = config.encoder_loss_fraction
+
+        # Audio conditioning : conditioning on music tokens (either from audio or from previous levels or both)
+        self.audio_conditioning = self.level != 0
+        self.cond_level = self.level - 1
+        if self.audio_conditioning:
+            self.conditioner_blocks = JukeboxMusicTokenConditioner(config, self.level)
+
+        # metadata conditioning : contioning on timing, genres, and artist
+        self.metadata_conditioning = config.metadata_conditioning
+        if self.metadata_conditioning:
+            self.metadata_embedding = JukeboxLabelConditioner(config, include_time_signal=not self.audio_conditioning)
+
+        # define encoder-decoder or encoder and decoder
+        self.is_encoder_decoder = config.is_encoder_decoder
+        if config.is_encoder_decoder:
+            # encoder-decoder transformer
+            self.input_shapes = [config.nb_relevant_lyric_tokens, config.n_ctx]
+            self.embed_dim_shift = [0, config.lyric_vocab_size]
+            self.width = config.hidden_size
+
+            self.nb_relevant_lyric_tokens = config.nb_relevant_lyric_tokens
+
+            self.prior = JukeboxConditionalAutoregressive(
+                config,
+                n_ctx=config.nb_relevant_lyric_tokens + config.n_ctx,
+                embed_dim=config.lyric_vocab_size + config.music_vocab_size,
+                audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
+                metadata_conditioning=True,
+            )
+
+        else:
+            # Separate encoder-decoder transformer
+            encoder_config = config.encoder_config
+
+            if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
+                self.lyric_acts_width = encoder_config.hidden_size
+                self.encoder_width = config.hidden_size
+                self.encoder_dim = config.lyric_vocab_size
+                self.encoder = JukeboxConditionalAutoregressive(
+                    encoder_config,
+                    n_ctx=self.nb_relevant_lyric_tokens,
+                    embed_dim=self.encoder_dim,
+                    audio_conditioning=False,
+                    metadata_conditioning=False,
+                    is_encoder=True,
+                )
+                self.encoder.proj_in = JukeboxConv1D(encoder_config.hidden_size, config.hidden_size)
+                self.encoder.final_layer_norm = JukeboxLayerNorm(config.hidden_size)
+                self.encoder.lm_head = nn.Linear(config.hidden_size, config.lyric_vocab_size, bias=False)
+            else:
+                self.nb_relevant_lyric_tokens = 0
+
+            # decoder model on the tokens
+            self.prior = JukeboxConditionalAutoregressive(
+                config,
+                audio_conditioning=(self.audio_conditioning or self.metadata_conditioning),
+                metadata_conditioning=self.metadata_conditioning,
+            )
+
+        self.next_token_prediction_loss_dims = config.n_ctx
+        self.total_loss_dims = self.nb_relevant_lyric_tokens + self.next_token_prediction_loss_dims
+
+        self.downsamples = [stride**down for stride, down in zip(config.res_strides_t, config.res_downs_t)]
+        self.cond_downsample = self.downsamples[self.level] if self.level != 0 else None
+        self.raw_to_tokens = np.prod(self.downsamples[: nb_priors - self.level])
+        self.sample_length = self.n_ctx * self.raw_to_tokens
+
+        logger.info(
+            f"Level:{self.level}, Cond downsample:{self.cond_downsample}, Raw to tokens:{self.raw_to_tokens}, Sample"
+            f" length:{self.sample_length}"
+        )
+
+    def get_metadata(self, labels, start, total_length, offset, get_indices=False):
+        metadata = labels.clone()
+        metadata[:, 0] = total_length
+        # Set sample_length to match this level
+        metadata[:, 2] = int(self.sample_length)
+
+        # Set offset
+        metadata[:, 1:2] = int(offset * self.raw_to_tokens) + int(start * self.raw_to_tokens)
+        # here since metadata has the full token_list, we just need to selected the ones that are relevant
+
+        # Set lyric tokens
+        metadata, indices = self.set_metadata_lyric_tokens(metadata)
+        if get_indices:
+            return metadata, indices
+        else:
+            return metadata
+
+    def set_metadata_lyric_tokens(self, labels):
+        """
+        Processes the full labels to only retrieve the relevant lyric tokens and keep the metadata conditioning tokens.
+        """
+        if self.nb_relevant_lyric_tokens > 0:
+            tokens_list = torch.zeros(
+                (labels.shape[0], self.nb_relevant_lyric_tokens), dtype=torch.long, device=labels.device
+            )
+            indices_list = []  # what's the index of each current character in original array
+            for idx in range(labels.shape[0]):
+                full_tokens = labels.clone()[:, 4 + self.metadata_embedding.max_nb_genres :]
+                total_length, offset, duration = labels[idx, 0], labels[idx, 1], labels[idx, 2]
+                tokens, indices = get_relevant_lyric_tokens(
+                    full_tokens, self.nb_relevant_lyric_tokens, total_length, offset, duration
+                )
+                tokens_list[idx, :] = tokens
+                indices_list.append(indices)
+
+            return (
+                torch.cat((labels[:, : 4 + self.metadata_embedding.max_nb_genres], tokens_list), dim=-1),
+                indices_list,
+            )
+        else:
+            return labels, None
+
+    def get_music_tokens_conds(self, music_tokens, start, end):
+        """
+        Extracts current level's conditioning music tokens.
+        """
+        if self.level != 0:
+            music_tokens_cond = music_tokens[self.level - 1]
+            music_tokens = music_tokens_cond[:, start // self.cond_downsample : end // self.cond_downsample]
+            missing_cond_len = self.n_ctx // self.cond_downsample - music_tokens_cond[-1].shape[-1]
+            if missing_cond_len > 0:
+                init_cond = torch.zeros(1, missing_cond_len).to(music_tokens_cond.device)
+                music_tokens_cond = torch.cat((music_tokens_cond, init_cond), dim=-1).long()
+            music_tokens_conds = [music_tokens_cond]
+        else:
+            music_tokens_conds = None
+        return music_tokens_conds
+
+    def prior_preprocess(self, tokens, conds):
+        """
+        Shifts the input tokens to account for the dictionary merge. The embed_dim_shift give by how much the music
+        tokens should be shifted by. It is equal to `lyric_vocab_size`.
+        """
+        batch_size = tokens[0].shape[0]
+        for i in range(len(tokens)):
+            tokens[i] = (tokens[i] + int(self.embed_dim_shift[i])).view(batch_size, -1)
+
+        for i in range(len(conds)):
+            if conds[i] is None:
+                conds[i] = torch.zeros(
+                    (batch_size, self.input_shapes[i], self.width), dtype=tokens[0].dtype, device=tokens[0].device
+                )
+
+        return torch.cat(tokens, dim=1), torch.cat(conds, dim=1)
+
+    def prior_postprocess(self, tokens):
+        """
+        Shifts back the input tokens if the model uses an encoder decoder architecture. As the embedding layer is
+        shared, `prior_embed_dim_shift` shifts the music token ids by `lyric_vocab_size`. Only returns the music
+        tokens.
+        """
+        batch_size = tokens.shape[0]
+        dims = (self.input_shapes[0], tokens.shape[1] - self.input_shapes[0])
+        tokens = list(torch.split(tokens, dims, dim=1))
+
+        # Some of the input tokens might be shifted to take into account the voccabulary fusion
+        for i in range(len(tokens)):
+            bins_shift = int(self.embed_dim_shift[i])
+            tokens[i] = (tokens[i] - bins_shift).view(batch_size, -1)
+            tokens[i] = torch.clamp(tokens[i], min=0)
+            # If not masking loss, model may have generated lyric/midi tokens which are now shifted <0 by bin_shift
+        return tokens[-1]
+
+    def embed_tokens(self, music_tokens_conds):
+        """
+        Embeds the upper level music tokens and upsamples them to provide as audio conditioning.
+        """
+        music_tokens_conds = music_tokens_conds[: self.cond_level + 1]
+        audio_conditioning = None
+        for music_tokens_cond, conditioner_block in reversed(list(zip(music_tokens_conds, [self.conditioner_blocks]))):
+            audio_conditioning = conditioner_block(music_tokens_cond, audio_conditioning)
+        return audio_conditioning
+
+    def encode(self, hidden_states, start_level=None, end_level=None, bs_chunks=1):
+        """
+        Encodes the hidden states (raw audio) using the VQVAE's encoder. Returns latent_states.
+        """
+        if start_level is None:
+            start_level = self.level
+        if end_level is None:
+            end_level = self.levels
+        # Get latents
+        with torch.no_grad():
+            latent_states = self.vqvae_encoder(
+                hidden_states, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks
+            )
+        return latent_states
+
+    def decode(self, music_tokens, start_level=None, end_level=None, bs_chunks=1):
+        """
+        Usamples the sequence of codebook vectors to a raw audio.
+        """
+        if start_level is None:
+            start_level = self.level
+        if end_level is None:
+            end_level = self.levels
+        with torch.no_grad():
+            output = self.vqvae_decoder(
+                music_tokens, start_level=start_level, end_level=end_level, bs_chunks=bs_chunks
+            )
+        return output
+
+    def get_cond(self, music_tokens_conds, metadata):
+        """
+        Converts the input tokens to input_embeddings. Splits the lyrics form the rest of the metadata. Lyric tokens
+        can be None.
+        """
+        if metadata is not None:
+            n_labels = metadata.shape[1] - self.nb_relevant_lyric_tokens
+            metadata, lyric_tokens = metadata[:, :n_labels], metadata[:, n_labels:]
+        else:
+            metadata, lyric_tokens = None, None
+        metadata_conditioning, metadata_pos = (
+            self.metadata_embedding(metadata) if self.metadata_conditioning else (None, None)
+        )
+        audio_conditioning = self.embed_tokens(music_tokens_conds) if self.audio_conditioning else metadata_pos
+        return audio_conditioning, metadata_conditioning, lyric_tokens
+
+    def sample(
+        self,
+        n_samples,
+        music_tokens=None,
+        music_tokens_conds=None,
+        metadata=None,
+        temp=1.0,
+        top_k=0,
+        top_p=0.0,
+        chunk_size=None,
+        sample_tokens=None,
+    ):
+        """
+        Ancestral/Prime sampling a window of tokens using the provided conditioning and metadatas.
+
+        Args:
+            n_samples (`int`):
+                Number of samples to generate.
+            music_tokens (`list[torch.LongTensor]`, *optional*):
+                Previously generated tokens at the current level. Used as context for the generation.
+            music_tokens_conds (`list[torch.FloatTensor]`, *optional*):
+                Upper-level music tokens generated by the previous prior model. Is `None` if the generation is not
+                conditioned on the upper-level tokens.
+            metadata (`list[torch.LongTensor]`, *optional*):
+                List containing the metadata tensor with the artist, genre and the lyric tokens.
+            temp (`float`, *optional*, defaults to 1.0):
+                Sampling temperature.
+            top_k (`int`, *optional*, defaults to 0):
+                Top k probabilities used for filtering.
+            top_p (`float`, *optional*, defaults to 0.0):
+                Top p probabilities used for filtering.
+            chunk_size (`int`, *optional*):
+                Size of the chunks used to prepare the cache of the transformer.
+            sample_tokens (`int`, *optional*):
+                Number of tokens to sample.
+
+        """
+        no_past_context = music_tokens is None or music_tokens.shape[1] == 0
+        name = {True: "Ancestral", False: "Primed"}[no_past_context]
+        logger.info(f"{name} sampling {n_samples} samples with temp={temp}, top_k={top_k}, top_p={top_p}")
+
+        with torch.no_grad():
+            # Currently audio_conditioning only uses immediately above layer
+            audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
+            if self.is_encoder_decoder:
+                if no_past_context:  # the prime_sample function will be used with music_tokens set to None
+                    lyric_and_music_tokens, audio_conditioning = self.prior_preprocess(
+                        [lyric_tokens], [None, audio_conditioning]
+                    )
+                else:
+                    lyric_and_music_tokens, audio_conditioning = self.prior_preprocess(
+                        [lyric_tokens, music_tokens], [None, audio_conditioning]
+                    )
+                if sample_tokens is not None:
+                    sample_tokens += self.nb_relevant_lyric_tokens
+                music_tokens = self.prior.primed_sample(
+                    n_samples,
+                    lyric_and_music_tokens,
+                    audio_conditioning,
+                    metadata_conditioning,
+                    temp=temp,
+                    top_k=top_k,
+                    top_p=top_p,
+                    chunk_size=chunk_size,
+                    sample_tokens=sample_tokens,
+                )
+                music_tokens = self.prior_postprocess(music_tokens)
+            else:
+                last_encoder_hidden_states = self.get_encoder_states(lyric_tokens, sample=True)
+                if no_past_context:
+                    music_tokens = self.prior.sample(
+                        n_samples,
+                        audio_conditioning,
+                        metadata_conditioning,
+                        last_encoder_hidden_states,
+                        temp=temp,
+                        top_k=top_k,
+                        top_p=top_p,
+                        sample_tokens=sample_tokens,
+                    )
+                else:
+                    music_tokens = self.prior.primed_sample(
+                        n_samples,
+                        music_tokens,
+                        audio_conditioning,
+                        metadata_conditioning,
+                        last_encoder_hidden_states,
+                        temp=temp,
+                        top_k=top_k,
+                        top_p=top_p,
+                        chunk_size=chunk_size,
+                        sample_tokens=sample_tokens,
+                    )
+        return music_tokens
+
+    def get_encoder_states(self, lyric_tokens, sample=False):
+        """
+        Retrieve the last hidden_states of the lyric encoder that will be attended to by the decoder. Forwards through
+        the lyric encoder.
+        """
+        if self.nb_relevant_lyric_tokens != 0 and self.lyric_conditioning:
+            if sample:
+                self.encoder = self.encoder.to(lyric_tokens.device)
+            lyric_acts = self.encoder(lyric_tokens, None, None, None)
+            lyric_acts = self.encoder.proj_in(lyric_acts)
+            last_encoder_hidden_states = self.encoder.final_layer_norm(lyric_acts)
+        else:
+            last_encoder_hidden_states = None
+        return last_encoder_hidden_states
+
+    def get_encoder_loss(self, last_encoder_hidden_states, target_lyrics):
+        """
+        Computes the loss for the lyric encoder: next lyric token prediction.
+        """
+        if self.lyric_conditioning:
+            last_encoder_hidden_states = self.encoder.lm_head(last_encoder_hidden_states)
+            encoder_loss = nn.functional.cross_entropy(
+                last_encoder_hidden_states.view(-1, self.encoder_dim), target_lyrics.view(-1)
+            ) / np.log(2.0)
+        else:
+            encoder_loss = torch.tensor(0.0, device=last_encoder_hidden_states.device)
+        return encoder_loss
+
+    def forward_tokens(
+        self, music_tokens, music_tokens_conds=[], metadata=None, get_preds=False, get_attn_weights=False
+    ):
+        """
+        Applies a forward pass using the conditioning tokens. Different from the classic forward as it does not use the
+        vqvae's encoding layers.
+        """
+        if get_attn_weights:
+            self.prior.transformer.set_record_attn(get_attn_weights)
+        audio_conditioning, metadata_conditioning, lyric_tokens = self.get_cond(music_tokens_conds, metadata)
+
+        if self.is_encoder_decoder:  # the preprocess returns the full tokens (Lyrics and Music tokens), shifted
+            tokens, audio_conditioning = self.prior_preprocess(
+                [lyric_tokens, music_tokens], [None, audio_conditioning]
+            )
+            (encoder_loss, next_token_prediction_loss), preds = self.prior(
+                tokens, audio_conditioning, metadata_conditioning, get_sep_loss=True, get_preds=get_preds
+            )
+        else:
+            last_encoder_hidden_states = self.get_encoder_states(lyric_tokens)
+            encoder_loss = self.get_encoder_loss(last_encoder_hidden_states, lyric_tokens)
+            next_token_prediction_loss, preds = self.prior(
+                music_tokens,
+                audio_conditioning,
+                metadata_conditioning,
+                last_encoder_hidden_states,
+                get_preds=get_preds,
+            )
+        loss = self.encoder_loss_fraction * encoder_loss * self.nb_relevant_lyric_tokens / self.total_loss_dims
+        loss += next_token_prediction_loss * self.next_token_prediction_loss_dims / self.total_loss_dims
+
+        metrics = {
+            "bpd": next_token_prediction_loss.detach().clone(),
+            "encoder_loss": encoder_loss.detach().clone(),
+            "next_token_prediction_loss": next_token_prediction_loss.detach().clone(),
+        }
+        if get_preds:
+            metrics["preds"] = preds.detach().clone()
+        if get_attn_weights:
+            saved_attn_weights = self.prior.transformer.saved_attn_weights
+            self.prior.transformer.set_record_attn(False)
+            return saved_attn_weights
+        else:
+            return loss, metrics
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        metadata: Optional[list[torch.LongTensor]],
+        decode: Optional[bool] = False,
+        get_preds: Optional[bool] = False,
+    ) -> list[torch.Tensor]:
+        """
+        Encode the hidden states using the `vqvae` encoder, and then predicts the next token in the `forward_tokens`
+        function. The loss is the sum of the `encoder` loss and the `decoder` loss.
+
+        Args:
+            hidden_states (`torch.Tensor`):
+                Hidden states which should be raw audio
+            metadata (`list[torch.LongTensor]`, *optional*):
+                List containing the metadata conditioning tensor with the lyric and the metadata tokens.
+            decode (`bool`, *optional*, defaults to `False`):
+                Whether or not to decode the encoded to tokens.
+            get_preds (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the actual predictions of the model.
+        """
+        batch_size = hidden_states.shape[0]
+        music_tokens, *music_tokens_conds = self.encode(hidden_states, bs_chunks=batch_size)
+        loss, metrics = self.forward_tokens(
+            music_tokens=music_tokens,
+            music_tokens_conds=music_tokens_conds,
+            metadata=metadata,
+            get_preds=get_preds,
+        )
+        if decode:
+            dequantised_states = self.decode([music_tokens, *music_tokens_conds])
+        else:
+            dequantised_states = None
+        return dequantised_states, loss, metrics
+
+
+class JukeboxPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: JukeboxConfig
+    base_model_prefix = "jukebox"
+    supports_gradient_checkpointing = False
+
+    def _init_weights(self, module):
+        if isinstance(module, (JukeboxPrior, JukeboxVQVAE)):
+            module.apply(module._init_weights)
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+
+JUKEBOX_SAMPLING_INPUT_DOCSTRING = r"""
+            labels (`list[torch.LongTensor]` of length `n_sample`, and shape `(self.levels, self.config.max_nb_genre + lyric_sequence_length)` :
+                List of metadata such as `artist_id`, `genre_id` and the full list of lyric tokens which are used to
+                condition the generation.
+            sampling_kwargs (`dict[Any]`):
+                Various additional sampling arguments that are used by the `_sample` function. A detail list of the
+                arguments can bee seen in the [`_sample`] function documentation.
+"""
+
+
+@add_start_docstrings(
+    """The bare JUKEBOX Model used for music generation. 4 sampling techniques are supported : `primed_sample`, `upsample`,
+    `continue_sample` and `ancestral_sample`. It does not have a `forward` method as the training is not end to end. If
+    you want to fine-tune the model, it is recommended to use the `JukeboxPrior` class and train each prior
+    individually.
+    """,
+    JUKEBOX_START_DOCSTRING,
+)
+class JukeboxModel(JukeboxPreTrainedModel):
+    _no_split_modules = ["JukeboxBlock"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        vqvae_config = config.vqvae_config
+        self.vqvae = JukeboxVQVAE(vqvae_config)
+        self.set_shared_params(config)
+        self.priors = nn.ModuleList(
+            [JukeboxPrior(config.prior_configs[level], level) for level in range(config.nb_priors)]
+        )
+
+    def set_shared_params(self, model_config):
+        """
+        Initialises the parameters that are shared. This has to be done here because the list of `JukeboxPriorConfig`
+        is nest, and is thus unreachable in the `from_dict` function
+        """
+        for config in model_config.prior_configs:
+            config.sampling_rate = model_config.sampling_rate
+            config.timing_dims = model_config.timing_dims
+            config.min_duration = model_config.min_duration
+            config.max_duration = model_config.max_duration
+            config.max_nb_genres = model_config.max_nb_genres
+            config.metadata_conditioning = model_config.metadata_conditioning
+
+    def decode(self, music_tokens, start_level=0, end_level=None, bs_chunks=1):
+        return self.vqvae.decode(music_tokens, start_level, end_level, bs_chunks)
+
+    def encode(self, input_audio, start_level=0, end_level=None, bs_chunks=1):
+        return self.vqvae.encode(input_audio, start_level, end_level, bs_chunks)
+
+    def split_batch(self, obj, n_samples, split_size):
+        n_passes = (n_samples + split_size - 1) // split_size
+        if isinstance(obj, torch.Tensor):
+            return torch.split(obj, split_size, dim=0)
+        elif isinstance(obj, list):
+            return list(zip(*[torch.split(item, split_size, dim=0) for item in obj]))
+        elif obj is None:
+            return [None] * n_passes
+        else:
+            raise TypeError("Unknown input type")
+
+    # Sample a partial window of length<n_ctx with tokens_to_sample new tokens on level=level
+    def sample_partial_window(
+        self, music_tokens, labels, offset, sampling_kwargs, level, tokens_to_sample, max_batch_size
+    ):
+        prior = self.priors[level]
+        sampled_tokens = music_tokens[level]
+        n_ctx = prior.n_ctx
+        nb_sampled_tokens = sampled_tokens.shape[1]
+        if nb_sampled_tokens < n_ctx - tokens_to_sample:
+            sampling_kwargs["sample_tokens"] = nb_sampled_tokens + tokens_to_sample
+            start = 0
+        else:
+            sampling_kwargs["sample_tokens"] = n_ctx
+            start = nb_sampled_tokens - n_ctx + tokens_to_sample
+
+        return self.sample_single_window(music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size)
+
+    # Sample a single window of length=n_ctx at position=start on level=level
+    def sample_single_window(self, music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size):
+        prior = self.priors[level]
+        n_samples = music_tokens[0].shape[0]
+        n_ctx = prior.n_ctx
+        end = start + n_ctx
+        # get music_tokens already sampled at current level
+        previous_sampled_tokens = music_tokens[level][:, start:end]
+
+        sample_tokens = sampling_kwargs.get("sample_tokens", None)
+        if "sample_tokens" in sampling_kwargs:
+            sample_tokens = end - start
+
+        conditioning_tokens = previous_sampled_tokens.shape[1]
+        new_tokens = sample_tokens - previous_sampled_tokens.shape[1]
+
+        logger.info(
+            f"Sampling {sample_tokens} tokens for [{start},{start + sample_tokens}]. Conditioning on"
+            f" {conditioning_tokens} tokens"
+        )
+
+        if new_tokens <= 0:
+            # Nothing new to sample
+            return music_tokens
+
+        # get music_tokens_conds from level above
+        music_tokens_conds = prior.get_music_tokens_conds(music_tokens, start, end)
+        # if there are no levels above should return None!
+
+        # set metadata offset, sample_length and lyrics tokens
+        metadata = prior.get_metadata(labels, start, self.total_length, offset)
+
+        music_tokens_list = self.split_batch(previous_sampled_tokens, n_samples, max_batch_size)
+        music_tokens_conds_list = self.split_batch(music_tokens_conds, n_samples, max_batch_size)
+        metadata_list = self.split_batch(metadata, n_samples, max_batch_size)
+        tokens = []
+        iterator = tqdm(zip(music_tokens_list, music_tokens_conds_list, metadata_list), leave=False)
+        for music_tokens_i, music_tokens_conds_i, metadata_i in iterator:
+            name = ["Ancestral", "Primed"][music_tokens_i.shape[1] == 0]
+            iterator.set_description(
+                f"[prior level {level}] {name} Sampling {sample_tokens} tokens out of"
+                f" {self.total_length // prior.raw_to_tokens}",
+                refresh=True,
+            )
+            tokens_i = prior.sample(
+                n_samples=music_tokens_i.shape[0],
+                music_tokens=music_tokens_i,
+                music_tokens_conds=music_tokens_conds_i,
+                metadata=metadata_i,
+                **sampling_kwargs,
+            )
+            tokens.append(tokens_i)
+        sampled_tokens = torch.cat(tokens, dim=0)
+
+        # Update music_tokens with new sample
+        music_tokens_new = sampled_tokens[:, -new_tokens:]
+        music_tokens[level] = torch.cat([music_tokens[level], music_tokens_new], dim=1)
+        return music_tokens
+
+    # Sample total_length tokens at level=level with hop_length=hop_length
+    def sample_level(
+        self, music_tokens, labels, offset, sampling_kwargs, level, total_length, hop_length, max_batch_size
+    ):
+        if total_length >= self.priors[level].n_ctx:
+            iterator = get_starts(total_length, self.priors[level].n_ctx, hop_length)
+            for start in iterator:
+                music_tokens = self.sample_single_window(
+                    music_tokens, labels, offset, sampling_kwargs, level, start, max_batch_size
+                )
+
+        else:
+            music_tokens = self.sample_partial_window(
+                music_tokens, labels, offset, sampling_kwargs, level, total_length, max_batch_size
+            )
+        return music_tokens
+
+    @torch.no_grad()
+    def _sample(
+        self,
+        music_tokens,
+        labels,
+        sample_levels,
+        metas=None,
+        chunk_size=32,
+        sampling_temperature=0.98,
+        lower_batch_size=16,
+        max_batch_size=16,
+        sample_length_in_seconds=24,
+        compute_alignments=False,
+        sample_tokens=None,
+        offset=0,
+        save_results=True,
+        sample_length=None,
+    ) -> list[torch.LongTensor]:
+        """
+        Core sampling function used to generate music tokens. Iterates over the provided list of levels, while saving
+        the generated raw audio at each step.
+
+        Args:
+            music_tokens (`list[torch.LongTensor]`):
+                A sequence of music tokens of length `self.levels` which will be used as context to continue the
+                sampling process. Should have `self.levels` tensors, each corresponding to the generation at a certain
+                level.
+            labels (`list[torch.LongTensor]`):
+                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
+                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
+                which are used to condition the generation.
+            sample_levels (`list[int]`):
+                List of the desired levels at which the sampling will be done. A level is equivalent to the index of
+                the prior in the list of priors
+            metas (`list[Any]`, *optional*):
+                Metadatas used to generate the `labels`
+            chunk_size (`int`, *optional*, defaults to 32):
+                Size of a chunk of audio, used to fill up the memory in chunks to prevent OOM errors. Bigger chunks
+                means faster memory filling but more consumption.
+            sampling_temperature (`float`, *optional*, defaults to 0.98):
+                Temperature used to adjust the randomness of the sampling.
+            lower_batch_size (`int`, *optional*, defaults to 16):
+                Maximum batch size for the lower level priors
+            max_batch_size (`int`, *optional*, defaults to 16):
+                Maximum batch size for the top level priors
+            sample_length_in_seconds (`int`, *optional*, defaults to 24):
+                Desired length of the generation in seconds
+            compute_alignments (`bool`, *optional*, defaults to `False`):
+                Whether or not to compute the alignment between the lyrics and the audio using the top_prior
+            sample_tokens (`int`, *optional*):
+                Precise number of tokens that should be sampled at each level. This is mostly useful for running dummy
+                experiments
+            offset (`int`, *optional*, defaults to 0):
+                Audio offset used as conditioning, corresponds to the starting sample in the music. If the offset is
+                greater than 0, the lyrics will be shifted take that intoaccount
+            save_results (`bool`, *optional*, defaults to `True`):
+                Whether or not to save the intermediate results. If `True`, will generate a folder named with the start
+                time.
+            sample_length (`int`, *optional*):
+                Desired length of the generation in samples.
+
+        Returns: torch.Tensor
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, JukeboxModel, set_seed
+        >>> import torch
+
+        >>> metas = dict(artist="Zac Brown Band", genres="Country", lyrics="I met a traveller from an antique land")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
+
+        >>> labels = tokenizer(**metas)["input_ids"]
+        >>> set_seed(0)
+        >>> zs = [torch.zeros(1, 0, dtype=torch.long) for _ in range(3)]
+        >>> zs = model._sample(zs, labels, [0], sample_length=40 * model.priors[0].raw_to_tokens, save_results=False)
+        >>> zs[0]
+        tensor([[1853, 1369, 1150, 1869, 1379, 1789,  519,  710, 1306, 1100, 1229,  519,
+              353, 1306, 1379, 1053,  519,  653, 1631, 1467, 1229, 1229,   10, 1647,
+             1254, 1229, 1306, 1528, 1789,  216, 1631, 1434,  653,  475, 1150, 1528,
+             1804,  541, 1804, 1434]])
+        ```
+        """
+
+        top_prior = self.priors[0]
+        if sample_length is not None:
+            total_length = sample_length
+        else:
+            total_length = (
+                int(sample_length_in_seconds * self.config.sampling_rate) // top_prior.raw_to_tokens
+            ) * top_prior.raw_to_tokens
+
+        if sample_levels is None:
+            sample_levels = range(len(self.priors))
+
+        # total length of the signal, might be bit different from the actual generated length
+        self.total_length = total_length
+        for level in sample_levels:
+            sampling_kwargs = {
+                "temp": 0.99 if level == len(self.priors) - 1 else sampling_temperature,
+                "chunk_size": chunk_size,
+                "sample_tokens": sample_tokens,
+            }
+            # Set correct total_length, hop_length, labels and sampling_kwargs for level
+
+            total_token_to_sample = total_length // self.priors[level].raw_to_tokens
+            hop_length = int(self.config.hop_fraction[level] * self.priors[level].n_ctx)
+            max_batch_size = lower_batch_size if level != sample_levels else max_batch_size
+            music_tokens = self.sample_level(
+                music_tokens,
+                labels[level],
+                offset,
+                sampling_kwargs,
+                level,
+                total_token_to_sample,
+                hop_length,
+                max_batch_size,
+            )
+
+            if save_results:
+                self.vqvae.to(music_tokens[level].device)
+                # Decode sample
+                with torch.no_grad():
+                    start_level = len(self.priors) - level - 1  # vqvae levels are reversed
+                    raw_audio = self.vqvae.decode(
+                        music_tokens[: level + 1], start_level=start_level, bs_chunks=music_tokens[level].shape[0]
+                    )
+                logdir = f"jukebox/level_{level}"
+                if not os.path.exists(logdir):
+                    os.makedirs(logdir)
+                save_temp_audio(logdir, level, metas=metas, aud=raw_audio.float())
+                if compute_alignments and self.priors[0] is not None and self.priors[0].nb_relevant_lyric_tokens > 0:
+                    with torch.no_grad():
+                        alignments = get_alignment(music_tokens, labels[0], self.priors[0], self.config)
+                    torch.save({"alignments": alignments}, f"{logdir}/lyric_alignments.pt")
+
+        return music_tokens
+
+    @add_start_docstrings(
+        """
+        Generates music tokens based on the provided `labels. Will start at the desired prior level and automatically
+        upsample the sequence. If you want to create the audio, you should call `model.decode(tokens)`, which will use
+        the VQ-VAE decoder to convert the music tokens to raw audio.
+
+        Args:
+            labels (`list[torch.LongTensor]`) :
+                List of length `n_sample`, and shape `(self.levels, 4 + self.config.max_nb_genre +
+                lyric_sequence_length)` metadata such as `artist_id`, `genre_id` and the full list of lyric tokens
+                which are used to condition the generation.
+            n_samples (`int`, *optional*, default to 1) :
+                Number of samples to be generated in parallel.
+        """,
+    )
+    def ancestral_sample(self, labels, n_samples=1, **sampling_kwargs) -> list[torch.LongTensor]:
+        """
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, JukeboxModel, set_seed
+
+        >>> model = JukeboxModel.from_pretrained("openai/jukebox-1b-lyrics", min_duration=0).eval()
+        >>> tokenizer = AutoTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+
+        >>> lyrics = "Hey, are you awake? Can you talk to me?"
+        >>> artist = "Zac Brown Band"
+        >>> genre = "Country"
+        >>> metas = tokenizer(artist=artist, genres=genre, lyrics=lyrics)
+        >>> set_seed(0)
+        >>> music_tokens = model.ancestral_sample(metas.input_ids, sample_length=400)
+
+        >>> with torch.no_grad():
+        ...     model.decode(music_tokens)[:, :10].squeeze(-1)
+        tensor([[-0.0219, -0.0679, -0.1050, -0.1203, -0.1271, -0.0936, -0.0396, -0.0405,
+            -0.0818, -0.0697]])
+        ```
+        """
+
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
+        music_tokens = [
+            torch.zeros(n_samples, 0, dtype=torch.long, device=labels[0].device) for _ in range(len(self.priors))
+        ]
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens
+
+    @add_start_docstrings(
+        """Generates a continuation of the previously generated tokens.
+
+        Args:
+            music_tokens (`list[torch.LongTensor]` of length `self.levels` ) :
+                A sequence of music tokens which will be used as context to continue the sampling process. Should have
+                `self.levels` tensors, each corresponding to the generation at a certain level.
+        """,
+        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
+    def continue_sample(self, music_tokens, labels, **sampling_kwargs) -> list[torch.LongTensor]:
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens
+
+    @add_start_docstrings(
+        """Upsamples a sequence of music tokens using the prior at level `level`.
+
+        Args:
+            music_tokens (`list[torch.LongTensor]` of length `self.levels` ) :
+                A sequence of music tokens which will be used as context to continue the sampling process. Should have
+                `self.levels` tensors, each corresponding to the generation at a certain level.
+        """,
+        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
+    def upsample(self, music_tokens, labels, **sampling_kwargs) -> list[torch.LongTensor]:
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors) - 1)))
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens
+
+    @add_start_docstrings(
+        """Generate a raw audio conditioned on the provided `raw_audio` which is used as conditioning at each of the
+        generation levels. The audio is encoded to music tokens using the 3 levels of the VQ-VAE. These tokens are
+        used: as conditioning for each level, which means that no ancestral sampling is required.
+
+        Args:
+            raw_audio (`list[torch.Tensor]` of length `n_samples` ) :
+                A list of raw audio that will be used as conditioning information for each samples that will be
+                generated.
+        """,
+        JUKEBOX_SAMPLING_INPUT_DOCSTRING,
+    )
+    def primed_sample(self, raw_audio, labels, **sampling_kwargs) -> list[torch.LongTensor]:
+        sample_levels = sampling_kwargs.pop("sample_levels", list(range(len(self.priors))))
+        self.vqvae.to(raw_audio.device).float()
+        with torch.no_grad():
+            music_tokens = self.vqvae.encode(
+                raw_audio, start_level=0, end_level=len(self.priors), bs_chunks=raw_audio.shape[0]
+            )
+        music_tokens = self._sample(music_tokens, labels, sample_levels, **sampling_kwargs)
+        return music_tokens
+
+
+__all__ = ["JukeboxModel", "JukeboxPreTrainedModel", "JukeboxVQVAE", "JukeboxPrior"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec2162db2cce52876fee37d9806c89b4dbabca2f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/jukebox/tokenization_jukebox.py
@@ -0,0 +1,407 @@
+# coding=utf-8
+# Copyright 2022 The Open AI Team Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for OpenAI Jukebox."""
+
+import json
+import os
+import re
+import unicodedata
+from json.encoder import INFINITY
+from typing import Any, Optional, Union
+
+import numpy as np
+import regex
+
+from ....tokenization_utils import AddedToken, PreTrainedTokenizer
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import TensorType, is_flax_available, is_tf_available, is_torch_available, logging
+from ....utils.generic import _is_jax, _is_numpy
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "artists_file": "artists.json",
+    "lyrics_file": "lyrics.json",
+    "genres_file": "genres.json",
+}
+
+
+class JukeboxTokenizer(PreTrainedTokenizer):
+    """
+    Constructs a Jukebox tokenizer. Jukebox can be conditioned on 3 different inputs :
+        - Artists, unique ids are associated to each artist from the provided dictionary.
+        - Genres, unique ids are associated to each genre from the provided dictionary.
+        - Lyrics, character based tokenization. Must be initialized with the list of characters that are inside the
+        vocabulary.
+
+    This tokenizer does not require training. It should be able to process a different number of inputs:
+    as the conditioning of the model can be done on the three different queries. If None is provided, defaults values will be used.:
+
+    Depending on the number of genres on which the model should be conditioned (`n_genres`).
+    ```python
+    >>> from transformers import JukeboxTokenizer
+
+    >>> tokenizer = JukeboxTokenizer.from_pretrained("openai/jukebox-1b-lyrics")
+    >>> tokenizer("Alan Jackson", "Country Rock", "old town road")["input_ids"]
+    [tensor([[   0,    0,    0, 6785,  546,   41,   38,   30,   76,   46,   41,   49,
+               40,   76,   44,   41,   27,   30]]), tensor([[  0,   0,   0, 145,   0]]), tensor([[  0,   0,   0, 145,   0]])]
+    ```
+
+    You can get around that behavior by passing `add_prefix_space=True` when instantiating this tokenizer or when you
+    call it on some text, but since the model was not pretrained this way, it might yield a decrease in performance.
+
+    <Tip>
+
+    If nothing is provided, the genres and the artist will either be selected randomly or set to None
+
+    </Tip>
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to:
+    this superclass for more information regarding those methods.
+
+    However the code does not allow that and only supports composing from various genres.
+
+    Args:
+        artists_file (`str`):
+            Path to the vocabulary file which contains a mapping between artists and ids. The default file supports
+            both "v2" and "v3"
+        genres_file (`str`):
+            Path to the vocabulary file which contain a mapping between genres and ids.
+        lyrics_file (`str`):
+            Path to the vocabulary file which contains the accepted characters for the lyrics tokenization.
+        version (`list[str]`, `optional`, default to `["v3", "v2", "v2"]`) :
+            List of the tokenizer versions. The `5b-lyrics`'s top level prior model was trained using `v3` instead of
+            `v2`.
+        n_genres (`int`, `optional`, defaults to 1):
+            Maximum number of genres to use for composition.
+        max_n_lyric_tokens (`int`, `optional`, defaults to 512):
+            Maximum number of lyric tokens to keep.
+        unk_token (`str`, *optional*, defaults to `"<|endoftext|>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        artists_file,
+        genres_file,
+        lyrics_file,
+        version=["v3", "v2", "v2"],
+        max_n_lyric_tokens=512,
+        n_genres=5,
+        unk_token="<|endoftext|>",
+        **kwargs,
+    ):
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        self.version = version
+        self.max_n_lyric_tokens = max_n_lyric_tokens
+        self.n_genres = n_genres
+        self._added_tokens_decoder = {0: unk_token}
+
+        with open(artists_file, encoding="utf-8") as vocab_handle:
+            self.artists_encoder = json.load(vocab_handle)
+
+        with open(genres_file, encoding="utf-8") as vocab_handle:
+            self.genres_encoder = json.load(vocab_handle)
+
+        with open(lyrics_file, encoding="utf-8") as vocab_handle:
+            self.lyrics_encoder = json.load(vocab_handle)
+
+        oov = r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+"
+        # In v2, we had a n_vocab=80 and in v3 we missed + and so n_vocab=79 of characters.
+        if len(self.lyrics_encoder) == 79:
+            oov = oov.replace(r"\-'", r"\-+'")
+
+        self.out_of_vocab = regex.compile(oov)
+        self.artists_decoder = {v: k for k, v in self.artists_encoder.items()}
+        self.genres_decoder = {v: k for k, v in self.genres_encoder.items()}
+        self.lyrics_decoder = {v: k for k, v in self.lyrics_encoder.items()}
+        super().__init__(
+            unk_token=unk_token,
+            n_genres=n_genres,
+            version=version,
+            max_n_lyric_tokens=max_n_lyric_tokens,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return len(self.artists_encoder) + len(self.genres_encoder) + len(self.lyrics_encoder)
+
+    def get_vocab(self):
+        return {
+            "artists_encoder": self.artists_encoder,
+            "genres_encoder": self.genres_encoder,
+            "lyrics_encoder": self.lyrics_encoder,
+        }
+
+    def _convert_token_to_id(self, list_artists, list_genres, list_lyrics):
+        """Converts the artist, genre and lyrics tokens to their index using the vocabulary.
+        The total_length, offset and duration have to be provided in order to select relevant lyrics and add padding to
+        the lyrics token sequence.
+        """
+        artists_id = [self.artists_encoder.get(artist, 0) for artist in list_artists]
+        for genres in range(len(list_genres)):
+            list_genres[genres] = [self.genres_encoder.get(genre, 0) for genre in list_genres[genres]]
+            list_genres[genres] = list_genres[genres] + [-1] * (self.n_genres - len(list_genres[genres]))
+
+        lyric_ids = [[self.lyrics_encoder.get(character, 0) for character in list_lyrics[0]], [], []]
+        return artists_id, list_genres, lyric_ids
+
+    def _tokenize(self, lyrics):
+        """
+        Converts a string into a sequence of tokens (string), using the tokenizer. Split in words for word-based
+        vocabulary or sub-words for sub-word-based vocabularies (BPE/SentencePieces/WordPieces).
+
+        Do NOT take care of added tokens. Only the lyrics are split into character for the character-based vocabulary.
+        """
+        # only lyrics are not tokenized, but character based is easily handled
+        return list(lyrics)
+
+    def tokenize(self, artist, genre, lyrics, **kwargs):
+        """
+        Converts three strings in a 3 sequence of tokens using the tokenizer
+        """
+        artist, genre, lyrics = self.prepare_for_tokenization(artist, genre, lyrics)
+        lyrics = self._tokenize(lyrics)
+        return artist, genre, lyrics
+
+    def prepare_for_tokenization(
+        self, artists: str, genres: str, lyrics: str, is_split_into_words: bool = False
+    ) -> tuple[str, str, str, dict[str, Any]]:
+        """
+        Performs any necessary transformations before tokenization.
+
+        Args:
+            artist (`str`):
+                The artist name to prepare. This will mostly lower the string
+            genres (`str`):
+                The genre name to prepare. This will mostly lower the string.
+            lyrics (`str`):
+                The lyrics to prepare.
+            is_split_into_words (`bool`, *optional*, defaults to `False`):
+                Whether or not the input is already pre-tokenized (e.g., split into words). If set to `True`, the
+                tokenizer assumes the input is already split into words (for instance, by splitting it on whitespace)
+                which it will tokenize. This is useful for NER or token classification.
+        """
+        for idx in range(len(self.version)):
+            if self.version[idx] == "v3":
+                artists[idx] = artists[idx].lower()
+                genres[idx] = [genres[idx].lower()]
+            else:
+                artists[idx] = self._normalize(artists[idx]) + ".v2"
+                genres[idx] = [
+                    self._normalize(genre) + ".v2" for genre in genres[idx].split("_")
+                ]  # split is for the full dictionary with combined genres
+
+        if self.version[0] == "v2":
+            self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-'\"()\[\] \t\n]+")
+            vocab = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789.,:;!?-+'\"()[] \t\n"
+            self.vocab = {vocab[index]: index + 1 for index in range(len(vocab))}
+            self.vocab["<unk>"] = 0
+            self.n_vocab = len(vocab) + 1
+            self.lyrics_encoder = self.vocab
+            self.lyrics_decoder = {v: k for k, v in self.vocab.items()}
+            self.lyrics_decoder[0] = ""
+        else:
+            self.out_of_vocab = regex.compile(r"[^A-Za-z0-9.,:;!?\-+'\"()\[\] \t\n]+")
+
+        lyrics = self._run_strip_accents(lyrics)
+        lyrics = lyrics.replace("\\", "\n")
+        lyrics = self.out_of_vocab.sub("", lyrics), [], []
+        return artists, genres, lyrics
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _normalize(self, text: str) -> str:
+        """
+        Normalizes the input text. This process is for the genres and the artist
+
+        Args:
+            text (`str`):
+                Artist or Genre string to normalize
+        """
+
+        accepted = (
+            [chr(i) for i in range(ord("a"), ord("z") + 1)]
+            + [chr(i) for i in range(ord("A"), ord("Z") + 1)]
+            + [chr(i) for i in range(ord("0"), ord("9") + 1)]
+            + ["."]
+        )
+        accepted = frozenset(accepted)
+        pattern = re.compile(r"_+")
+        text = "".join([c if c in accepted else "_" for c in text.lower()])
+        text = pattern.sub("_", text).strip("_")
+        return text
+
+    def convert_lyric_tokens_to_string(self, lyrics: list[str]) -> str:
+        return " ".join(lyrics)
+
+    def convert_to_tensors(
+        self, inputs, tensor_type: Optional[Union[str, TensorType]] = None, prepend_batch_axis: bool = False
+    ):
+        """
+        Convert the inner content to tensors.
+
+        Args:
+            tensor_type (`str` or [`~utils.TensorType`], *optional*):
+                The type of tensors to use. If `str`, should be one of the values of the enum [`~utils.TensorType`]. If
+                unset, no modification is done.
+            prepend_batch_axis (`int`, *optional*, defaults to `False`):
+                Whether or not to add the batch dimension during the conversion.
+        """
+        # Convert to TensorType
+        if not isinstance(tensor_type, TensorType):
+            tensor_type = TensorType(tensor_type)
+
+        # Get a function reference for the correct framework
+        if tensor_type == TensorType.TENSORFLOW:
+            if not is_tf_available():
+                raise ImportError(
+                    "Unable to convert output to TensorFlow tensors format, TensorFlow is not installed."
+                )
+            import tensorflow as tf
+
+            as_tensor = tf.constant
+            is_tensor = tf.is_tensor
+        elif tensor_type == TensorType.PYTORCH:
+            if not is_torch_available():
+                raise ImportError("Unable to convert output to PyTorch tensors format, PyTorch is not installed.")
+            import torch
+
+            as_tensor = torch.tensor
+            is_tensor = torch.is_tensor
+        elif tensor_type == TensorType.JAX:
+            if not is_flax_available():
+                raise ImportError("Unable to convert output to JAX tensors format, JAX is not installed.")
+            import jax.numpy as jnp  # noqa: F811
+
+            as_tensor = jnp.array
+            is_tensor = _is_jax
+        else:
+            as_tensor = np.asarray
+            is_tensor = _is_numpy
+
+        # Do the tensor conversion in batch
+
+        try:
+            if prepend_batch_axis:
+                inputs = [inputs]
+
+            if not is_tensor(inputs):
+                inputs = as_tensor(inputs)
+        except:  # noqa E722
+            raise ValueError(
+                "Unable to create tensor, you should probably activate truncation and/or padding "
+                "with 'padding=True' 'truncation=True' to have batched tensors with the same length."
+            )
+
+        return inputs
+
+    def __call__(self, artist, genres, lyrics="", return_tensors="pt") -> BatchEncoding:
+        """Convert the raw string to a list of token ids
+
+        Args:
+            artist (`str`):
+                Name of the artist.
+            genres (`str`):
+                List of genres that will be mixed to condition the audio
+            lyrics (`str`, *optional*, defaults to `""`):
+                Lyrics used to condition the generation
+        """
+        input_ids = [0, 0, 0]
+        artist = [artist] * len(self.version)
+        genres = [genres] * len(self.version)
+
+        artists_tokens, genres_tokens, lyrics_tokens = self.tokenize(artist, genres, lyrics)
+        artists_id, genres_ids, full_tokens = self._convert_token_to_id(artists_tokens, genres_tokens, lyrics_tokens)
+
+        attention_masks = [-INFINITY] * len(full_tokens[-1])
+        input_ids = [
+            self.convert_to_tensors(
+                [input_ids + [artists_id[i]] + genres_ids[i] + full_tokens[i]], tensor_type=return_tensors
+            )
+            for i in range(len(self.version))
+        ]
+        return BatchEncoding({"input_ids": input_ids, "attention_masks": attention_masks})
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        """
+        Saves the tokenizer's vocabulary dictionary to the provided save_directory.
+
+        Args:
+            save_directory (`str`):
+                A path to the directory where to saved. It will be created if it doesn't exist.
+
+            filename_prefix (`Optional[str]`, *optional*):
+                A prefix to add to the names of the files saved by the tokenizer.
+
+        """
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+
+        artists_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["artists_file"]
+        )
+        with open(artists_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.artists_encoder, ensure_ascii=False))
+
+        genres_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["genres_file"]
+        )
+        with open(genres_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.genres_encoder, ensure_ascii=False))
+
+        lyrics_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["lyrics_file"]
+        )
+        with open(lyrics_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.lyrics_encoder, ensure_ascii=False))
+
+        return (artists_file, genres_file, lyrics_file)
+
+    def _convert_id_to_token(self, artists_index, genres_index, lyric_index):
+        """
+        Converts an index (integer) in a token (str) using the vocab.
+
+        Args:
+            artists_index (`int`):
+                Index of the artist in its corresponding dictionary.
+            genres_index (`Union[list[int], int]`):
+               Index of the genre in its corresponding dictionary.
+            lyric_index (`list[int]`):
+                List of character indices, which each correspond to a character.
+        """
+        artist = self.artists_decoder.get(artists_index)
+        genres = [self.genres_decoder.get(genre) for genre in genres_index]
+        lyrics = [self.lyrics_decoder.get(character) for character in lyric_index]
+        return artist, genres, lyrics
+
+
+__all__ = ["JukeboxTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..53ec5ed37c13614266d04cde838ff7360946a451
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_mctct import *
+    from .feature_extraction_mctct import *
+    from .modeling_mctct import *
+    from .processing_mctct import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..984dca4a62bfa726bae9081183cd18853381d3ad
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/configuration_mctct.py
@@ -0,0 +1,184 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""M-CTC-T model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MCTCTConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MCTCTModel`]. It is used to instantiate an
+    M-CTC-T model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the M-CTC-T
+    [speechbrain/m-ctc-t-large](https://huggingface.co/speechbrain/m-ctc-t-large) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 8065):
+            Vocabulary size of the M-CTC-T model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MCTCTModel`].
+        hidden_size (`int`, *optional*, defaults to 1536):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 36):
+            Number of hidden layers in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 6144):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        attention_head_dim (`int`, *optional*, defaults to 384):
+            Dimensions of each attention head for each attention layer in the Transformer encoder.
+        max_position_embeddings (`int`, *optional*, defaults to 920):
+            The maximum sequence length that this model might ever be used with (after log-mel spectrogram extraction).
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        layerdrop (`float`, *optional*, defaults to 0.3):
+            The probability of dropping an encoder layer during training. The default 0.3 value is used in the original
+            implementation.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.3):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.3):
+            The dropout ratio for the attention probabilities.
+        pad_token_id (`int`, *optional*, defaults to 1):
+            The tokenizer index of the pad token.
+        bos_token_id (`int`, *optional*, defaults to 0):
+            The tokenizer index of the bos token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The tokenizer index of the eos token.
+        conv_glu_dim (`int`, *optional*, defaults to 1):
+            The dimension of the output of the `Conv1dSubsampler` layer in which GLU is applied on. Though the original
+            Flashlight code uses the value of 2, here it's adapted to 1 due to transposition differences.
+        conv_dropout (`int`, *optional*, defaults to 0.3):
+            The probability of randomly dropping the `Conv1dSubsampler` layer during training.
+        num_conv_layers (`int`, *optional*, defaults to 1):
+            Number of convolution layers before applying transformer encoder layers.
+        conv_kernel (`Sequence[int]`, *optional*, defaults to `(7,)`):
+            The kernel size of the 1D convolution applied before transformer layers. `len(conv_kernel)` must be equal
+            to `num_conv_layers`.
+        conv_stride (`Sequence[int]`, *optional*, defaults to `(3,)`):
+            The stride length of the 1D convolution applied before transformer layers. `len(conv_stride)` must be equal
+            to `num_conv_layers`.
+        input_feat_per_channel (`int`, *optional*, defaults to 80):
+            Feature dimensions of the channels of the input to the Conv1D layer.
+        input_channels (`int`, *optional*, defaults to 1):
+            Number of input channels of the input to the Conv1D layer.
+        conv_channels (`list[int]`, *optional*):
+            Channel sizes of intermediate Conv1D layers.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`MCTCTForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`MCTCTForCTC`].
+
+    Example:
+
+    ```python
+    >>> from transformers import MCTCTConfig, MCTCTModel
+
+    >>> # Initializing a M-CTC-T mctct-large style configuration
+    >>> configuration = MCTCTConfig()
+
+    >>> # Initializing a model (with random weights) from the mctct-large style configuration
+    >>> model = MCTCTModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mctct"
+
+    def __init__(
+        self,
+        vocab_size=8065,
+        hidden_size=1536,
+        num_hidden_layers=36,
+        intermediate_size=6144,
+        num_attention_heads=4,
+        attention_head_dim=384,
+        max_position_embeddings=920,
+        layer_norm_eps=1e-5,
+        layerdrop=0.3,
+        hidden_act="relu",
+        initializer_range=0.02,
+        hidden_dropout_prob=0.3,
+        attention_probs_dropout_prob=0.3,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        conv_glu_dim=1,
+        conv_dropout=0.3,
+        num_conv_layers=1,
+        conv_kernel=(7,),
+        conv_stride=(3,),
+        input_feat_per_channel=80,
+        input_channels=1,
+        conv_channels=None,
+        ctc_loss_reduction="sum",
+        ctc_zero_infinity=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.layer_norm_eps = layer_norm_eps
+        self.layerdrop = layerdrop
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.pad_token_id = pad_token_id
+        self.bos_token_id = bos_token_id
+        self.eos_token_id = eos_token_id
+        self.conv_glu_dim = conv_glu_dim
+        self.conv_dropout = conv_dropout
+        self.num_conv_layers = num_conv_layers
+        self.input_feat_per_channel = input_feat_per_channel
+        self.input_channels = input_channels
+        self.conv_channels = conv_channels
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+
+        # prevents config testing fail with exporting to json
+        self.conv_kernel = list(conv_kernel)
+        self.conv_stride = list(conv_stride)
+
+        if len(self.conv_kernel) != self.num_conv_layers:
+            raise ValueError(
+                "Configuration for convolutional module is incorrect. "
+                "It is required that `len(config.conv_kernel)` == `config.num_conv_layers` "
+                f"but is `len(config.conv_kernel) = {len(self.conv_kernel)}`, "
+                f"`config.num_conv_layers = {self.num_conv_layers}`."
+            )
+
+
+__all__ = ["MCTCTConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..966a160f91b07177b16043a00ccd76685926a422
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/feature_extraction_mctct.py
@@ -0,0 +1,291 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Feature extractor class for M-CTC-T
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....audio_utils import mel_filter_bank, optimal_fft_length, spectrogram, window_function
+from ....feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ....feature_extraction_utils import BatchFeature
+from ....file_utils import PaddingStrategy, TensorType
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MCTCTFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a M-CTC-T feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods. This
+    code has been adapted from Flashlight's C++ code. For more information about the implementation, one can refer to
+    this [notebook](https://colab.research.google.com/drive/1GLtINkkhzms-IsdcGy_-tVCkv0qNF-Gt#scrollTo=pMCRGMmUC_an)
+    that takes the user step-by-step in the implementation.
+
+    Args:
+        feature_size (`int`, defaults to 80):
+            The feature dimension of the extracted features. This is the number of mel_frequency
+        sampling_rate (`int`, defaults to 16000):
+            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
+        padding_value (`float`, defaults to 0.0):
+            The value that is used to fill the padding values.
+        hop_length (`int`, defaults to 10):
+            Number of audio samples between windows. Otherwise referred to as "shift" in many papers.
+        win_length (`int`, defaults to 25):
+            Number of ms per window
+        win_function (`str`, defaults to `"hamming_window"`):
+            Name for the window function used for windowing, must be accessible via `torch.{win_function}`
+        frame_signal_scale (`float`, defaults to 32768.0):
+            Constant multiplied in creating the frames before applying DFT.
+        preemphasis_coeff (`float`, defaults to 0.97):
+            Constant multiplied in applying Pre-emphasis before DFT.
+        mel_floor (`float` defaults to 1.0):
+            Minimum value of mel frequency banks.
+        normalize_means (`bool`, *optional*, defaults to `True`):
+            Whether or not to zero-mean normalize the extracted features.
+        normalize_vars (`bool`, *optional*, defaults to `True`):
+            Whether or not to unit-variance normalize the extracted features.
+    """
+
+    model_input_names = ["input_features", "attention_mask"]
+
+    def __init__(
+        self,
+        feature_size=80,
+        sampling_rate=16000,
+        padding_value=0.0,
+        hop_length=10,
+        win_length=25,
+        win_function="hamming_window",
+        frame_signal_scale=32768.0,
+        preemphasis_coeff=0.97,
+        mel_floor=1.0,
+        normalize_means=True,
+        normalize_vars=True,
+        return_attention_mask=False,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+
+        self.feature_size = feature_size
+        self.sampling_rate = sampling_rate
+        self.padding_value = padding_value
+        self.hop_length = hop_length
+        self.win_length = win_length
+        self.frame_signal_scale = frame_signal_scale
+        self.preemphasis_coeff = preemphasis_coeff
+        self.mel_floor = mel_floor
+        self.normalize_means = normalize_means
+        self.normalize_vars = normalize_vars
+        self.win_function = win_function
+        self.return_attention_mask = return_attention_mask
+
+        self.sample_size = win_length * sampling_rate // 1000
+        self.sample_stride = hop_length * sampling_rate // 1000
+
+        self.n_fft = optimal_fft_length(self.sample_size)
+        self.n_freqs = (self.n_fft // 2) + 1
+
+    def _extract_mfsc_features(self, one_waveform: np.ndarray) -> np.ndarray:
+        """
+        Extracts MFSC Features for one waveform vector (unbatched). Adapted from Flashlight's C++ MFSC code.
+        """
+        if self.win_function == "hamming_window":
+            window = window_function(window_length=self.sample_size, name=self.win_function, periodic=False)
+        else:
+            window = window_function(window_length=self.sample_size, name=self.win_function)
+
+        fbanks = mel_filter_bank(
+            num_frequency_bins=self.n_freqs,
+            num_mel_filters=self.feature_size,
+            min_frequency=0.0,
+            max_frequency=self.sampling_rate / 2.0,
+            sampling_rate=self.sampling_rate,
+        )
+
+        msfc_features = spectrogram(
+            one_waveform * self.frame_signal_scale,
+            window=window,
+            frame_length=self.sample_size,
+            hop_length=self.sample_stride,
+            fft_length=self.n_fft,
+            center=False,
+            preemphasis=self.preemphasis_coeff,
+            mel_filters=fbanks,
+            mel_floor=self.mel_floor,
+            log_mel="log",
+        )
+        return msfc_features.T
+
+    def _normalize_one(self, x, input_length, padding_value):
+        # make sure we normalize float32 arrays
+        if self.normalize_means:
+            mean = x[:input_length].mean(axis=0)
+            x = np.subtract(x, mean)
+        if self.normalize_vars:
+            std = x[:input_length].std(axis=0)
+            x = np.divide(x, std)
+
+        if input_length < x.shape[0]:
+            x[input_length:] = padding_value
+
+        # make sure array is in float32
+        x = x.astype(np.float32)
+
+        return x
+
+    def normalize(
+        self, input_features: list[np.ndarray], attention_mask: Optional[np.ndarray] = None
+    ) -> list[np.ndarray]:
+        lengths = attention_mask.sum(-1) if attention_mask is not None else [x.shape[0] for x in input_features]
+        return [self._normalize_one(x, n, self.padding_value) for x, n in zip(input_features, lengths)]
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Union[bool, str, PaddingStrategy] = False,
+        max_length: Optional[int] = None,
+        truncation: bool = False,
+        pad_to_multiple_of: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s). sequences. It returns the
+        log-mel spectrogram of the input audio, as implemented in the original Flashlight MFSC feature extraction code.
+
+        Args:
+            raw_speech (`torch.Tensor`, `np.ndarray`, `list[float]`, `list[torch.Tensor]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a tensor, a numpy array, a list
+                of float values, a list of tensors, a list of numpy arrays or a list of list of float values. Must be
+                mono channel audio, not stereo, i.e. single float per timestep.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`):
+                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific feature_extractor's default.
+
+                [What are attention masks?](../glossary#attention-mask)
+
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+            padding_value (`float`, defaults to 0.0):
+        """
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                "It is strongly recommended to pass the ``sampling_rate`` argument to this function. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        is_batched = is_batched_numpy or (
+            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech, dtype=np.float32)
+        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+            raw_speech = raw_speech.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_speech = [raw_speech]
+
+        # extract fbank features
+        features = [self._extract_mfsc_features(one_waveform) for one_waveform in raw_speech]
+
+        # convert into correct format for padding
+        encoded_inputs = BatchFeature({"input_features": features})
+
+        padded_inputs = self.pad(
+            encoded_inputs,
+            padding=padding,
+            max_length=max_length,
+            truncation=truncation,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=True,
+            **kwargs,
+        )
+        # make sure list is in array format
+        input_features = padded_inputs.get("input_features")
+        if isinstance(input_features[0], list):
+            padded_inputs["input_features"] = [np.asarray(feature, dtype=np.float32) for feature in input_features]
+
+        attention_mask = padded_inputs.get("attention_mask")
+        if attention_mask is not None:
+            padded_inputs["attention_mask"] = [np.asarray(array, dtype=np.int32) for array in attention_mask]
+
+        if self.normalize_means or self.normalize_vars:
+            attention_mask = (
+                np.array(attention_mask, dtype=np.int32)
+                if self._get_padding_strategies(padding, max_length=max_length) is not PaddingStrategy.DO_NOT_PAD
+                and padding
+                else None
+            )
+            padded_inputs["input_features"] = self.normalize(
+                padded_inputs["input_features"], attention_mask=attention_mask
+            )
+
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["MCTCTFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..16f59d3d1dfa11a11bd1ef51ad77e137adf2cc93
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/modeling_mctct.py
@@ -0,0 +1,778 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch M-CTC-T model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....file_utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward
+from ....integrations.deepspeed import is_deepspeed_zero3_enabled
+from ....integrations.fsdp import is_fsdp_managed_module
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput, CausalLMOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import logging
+from .configuration_mctct import MCTCTConfig
+
+
+logger = logging.get_logger(__name__)
+
+_HIDDEN_STATES_START_POSITION = 1
+
+_CONFIG_FOR_DOC = "MCTCTConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "speechbrain/m-ctc-t-large"
+_EXPECTED_OUTPUT_SHAPE = [1, 195, 1536]
+
+# CTC docstring
+_CTC_EXPECTED_OUTPUT = '"Mr. Quilter is the apostle of the middle classes, and we\'re glad to welcome his gospel."'
+_CTC_EXPECTED_LOSS = 1885.65
+
+
+class MCTCTConv1dSubsampler(nn.Module):
+    """
+    Convolutional subsampler: a stack of 1D convolution (along temporal dimension) followed by non-linear activation
+    via gated linear units (https://huggingface.co/papers/1911.08460)
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.glu_dim = config.conv_glu_dim
+
+        self.dropout = nn.Dropout(config.conv_dropout)
+
+        self.num_layers = config.num_conv_layers
+        self.in_channels = config.input_feat_per_channel * config.input_channels
+
+        if self.num_layers > 1:
+            if config.conv_channels is None:
+                raise ValueError(
+                    "Need to specify `conv_channels` configuration in `MCTCTConfig` to use multiple convolution"
+                    " layers."
+                )
+
+            self.mid_channels = config.conv_channels
+        else:
+            self.mid_channels = None
+
+        self.out_channels = config.hidden_size * 2  # considering GLU halving
+        self.kernel_size = config.conv_kernel
+        self.stride = config.conv_stride
+
+        # NOTE: MCTCT by construction only uses one convolution kernel. I've made this flexible to allow for
+        # multiple layers of convolutions, but not sure if this model definition should just restrict it
+        # to one layer. This becomes especially relevant when considering the padding like line 1 of forward().
+        self.conv_layers = nn.ModuleList(
+            nn.Conv1d(
+                self.in_channels if i == 0 else self.mid_channels[i],
+                self.mid_channels[i] if i < self.num_layers - 1 else self.out_channels,
+                kernel_size=k,
+                stride=self.stride[i],
+                padding="valid",
+            )
+            for i, k in enumerate(self.kernel_size)
+        )
+
+    def forward(self, input_features):
+        # NOTE: in reference to the NOTE in __init__, right now it just calculates padding as if
+        # there will be just one conv layer.
+        padding = sum(size // 2 for size in self.kernel_size)  # (7, 7) -> (3, 3)
+
+        input_features = torch.nn.functional.pad(input_features, (0, 0, padding, padding), "constant", 0)
+        hidden_states = input_features.transpose(1, 2).contiguous()  # -> Batch x Frame x Time
+        for conv in self.conv_layers:
+            hidden_states = conv(hidden_states)
+            hidden_states = nn.functional.glu(hidden_states, dim=self.glu_dim)
+            hidden_states = self.dropout(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2).contiguous()  # -> Batch x Time x Frame
+        return hidden_states
+
+
+class MCTCTEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        # self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.LayerNorm = MCTCTLayerNorm()
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids",
+            torch.zeros(self.position_ids.size(), dtype=torch.long, device=self.position_ids.device),
+            persistent=False,
+        )
+
+    def forward(
+        self, input_features=None, token_type_ids=None, position_ids=None, inputs_embeds=None, past_key_values_length=0
+    ):
+        input_shape = input_features.size() if input_features is not None else inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_features)
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class MCTCTSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = config.attention_head_dim
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=False)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+        self.max_position_embeddings = config.max_position_embeddings
+        self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def reshape_fortran(self, x, shape):
+        if len(x.shape) > 0:
+            x = x.permute(*reversed(range(len(x.shape))))
+        return x.reshape(*reversed(shape)).permute(*reversed(range(len(shape))))
+
+    def relative_position_embedding_rotate(self, scores):
+        # NOTE: should re-evaluate whether this re-implementation was truly necessary
+        # or the reason why my complete re-haul worked was due to some other part
+        # of the code. Adding this and the reshape fortrain code seems very undesirable.
+        scores = scores.permute(0, 2, 3, 1)  # e.g. [10, 1839, 14, 4]
+
+        batch, hidden_state, seq_len, heads = scores.shape
+
+        # e.g. [10, 1853, 14, 4]
+        scores = torch.cat((scores, torch.zeros((batch, seq_len, seq_len, heads), device=scores.device)), dim=1)
+
+        # e.g. [10, 25942, 1, 4]
+        scores = self.reshape_fortran(scores, [batch, (hidden_state + seq_len) * seq_len, 1, heads])
+
+        # e.g. [10, 25928, 1, 4]
+        scores = scores[:, : (seq_len + hidden_state - 1) * seq_len]
+
+        # e.g. [10, 1852, 14, 4]
+        scores = self.reshape_fortran(scores, [batch, hidden_state + seq_len - 1, seq_len, heads])
+
+        halfpoint = hidden_state // 2
+        scores = scores[:, halfpoint : halfpoint + seq_len].transpose(1, 2)  # e.g. [10, 14, 14, 4]
+
+        return scores.permute(0, 3, 1, 2)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+    ):
+        mixed_query_layer = self.query(hidden_states)
+        mixed_query_layer = mixed_query_layer / math.sqrt(self.attention_head_size)
+
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        # relative key position embeddings
+        positional_embedding = self.distance_embedding.weight
+        relative_position_scores = torch.einsum("lh, bche -> bcle", positional_embedding, query_layer.transpose(2, 3))
+
+        relative_position_scores = self.relative_position_embedding_rotate(relative_position_scores)
+        attention_scores = attention_scores + relative_position_scores
+
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in MCTCTModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).flatten(start_dim=-2)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class MCTCTLayerNorm(nn.Module):
+    def __init__(self):
+        super().__init__()
+        self.singleton_weight = nn.Parameter(torch.ones(1))
+        self.singleton_bias = nn.Parameter(torch.zeros(1))
+
+    def forward(self, hidden_states):
+        return (hidden_states * self.singleton_weight) + self.singleton_bias
+
+
+class MCTCTSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class MCTCTAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.self = MCTCTSelfAttention(config)
+        self.output = MCTCTSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+
+        return outputs
+
+
+class MCTCTIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size, bias=False)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class MCTCTOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class MCTCTLayer(GradientCheckpointingLayer):
+    def __init__(self, config: MCTCTConfig):
+        super().__init__()
+
+        self.seq_len_dim = 1
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+
+        self.intermediate = MCTCTIntermediate(config)
+        self.attention = MCTCTAttention(config)
+        self.is_decoder = config.is_decoder
+        self.output = MCTCTOutput(config)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+    ):
+        self_attention_outputs = self.attention(
+            hidden_states, attention_mask, head_mask, output_attentions=output_attentions
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class MCTCTPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: MCTCTConfig
+    base_model_prefix = "mctct"
+    main_input_name = "input_features"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, MCTCTLayerNorm):
+            module.singleton_weight.data.fill_(1.0)
+            module.singleton_bias.data.zero_()
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+    def _get_feat_extract_output_lengths(self, input_lengths: torch.LongTensor):
+        """
+        Computes the output length of the convolutional layers
+        """
+        dilation = 1
+        for _, kernel_sz, stride in zip(
+            range(self.config.num_conv_layers), self.config.conv_kernel, self.config.conv_stride
+        ):
+            padding = kernel_sz // 2
+            input_lengths = input_lengths + 2 * padding - dilation * (kernel_sz - 1) - 1
+            input_lengths = torch.div(input_lengths, stride, rounding_mode="trunc") + 1
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(self, feature_vector_length, attention_mask):
+        # generate creates 3D attention mask, because of the shape of input_features
+        # convert it to 2D if that's the case
+        if len(attention_mask.shape) > 2:
+            attention_mask = attention_mask[:, :, -1]
+
+        # subsampled_lengths = attention_mask.sum(-1)
+        subsampled_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1))
+        bsz = attention_mask.size()[0]
+        attention_mask = torch.zeros(
+            (bsz, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+
+        # these two operations makes sure that all values
+        # before the output lengths indices are attended to
+        attention_mask[(torch.arange(bsz, device=attention_mask.device), subsampled_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).long()
+        return attention_mask
+
+
+MCTCT_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`MCTCTConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+MCTCT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_features (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`Wav2Vec2CTCTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class MCTCTEncoder(MCTCTPreTrainedModel):
+    def __init__(self, config: MCTCTConfig):
+        super().__init__(config)
+        self.hidden_dropout_prob = config.hidden_dropout_prob
+
+        self.layer_norm = MCTCTLayerNorm()
+        self.conv = MCTCTConv1dSubsampler(config)
+        self.layers = nn.ModuleList([MCTCTLayer(config) for _ in range(config.num_hidden_layers)])
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        input_features: torch.Tensor,
+        attention_mask: torch.Tensor,
+        head_mask: torch.Tensor,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        input_features = self.layer_norm(input_features)
+
+        inputs_embeds = self.conv(input_features)
+
+        # subsample attention mask if necessary
+        if attention_mask is not None:
+            attention_mask = self._get_feature_vector_attention_mask(inputs_embeds.shape[1], attention_mask)
+
+        hidden_states = nn.functional.dropout(inputs_embeds, p=self.hidden_dropout_prob, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, "
+                    f"but it is for {head_mask.size()[0]}."
+                )
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = encoder_layer(
+                    hidden_states=hidden_states,
+                    attention_mask=attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+@add_start_docstrings(
+    "The bare M-CTC-T Model transformer outputting raw hidden-states without any specific head on top.",
+    MCTCT_START_DOCSTRING,
+)
+class MCTCTModel(MCTCTPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.encoder = MCTCTEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+        modality="audio",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def forward(
+        self,
+        input_features: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_features is None:
+            raise ValueError("You have to specify input_features.")
+
+        encoder_outputs = self.encoder(
+            input_features,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """MCTCT Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
+    MCTCT_START_DOCSTRING,
+)
+class MCTCTForCTC(MCTCTPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.mctct = MCTCTModel(config)
+
+        if config.vocab_size is None:
+            raise ValueError(
+                f"You are trying to instantiate {self.__class__} with a configuration that "
+                "does not define the vocabulary size of the language model head. Please "
+                "instantiate the model as follows: `MCTCTForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+                "or define `vocab_size` of your model's configuration."
+            )
+        output_hidden_size = config.hidden_size
+
+        self.ctc_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(MCTCT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=CausalLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_CTC_EXPECTED_OUTPUT,
+        expected_loss=_CTC_EXPECTED_LOSS,
+    )
+    def forward(
+        self,
+        input_features: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        outputs = self.mctct(
+            input_features,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+
+        logits = self.ctc_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask
+                if attention_mask is not None
+                else torch.ones(input_features.shape[:-1], dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+__all__ = ["MCTCTForCTC", "MCTCTModel", "MCTCTPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/processing_mctct.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/processing_mctct.py
new file mode 100644
index 0000000000000000000000000000000000000000..9661a5082d4838b4e60042be2ceb747c03e045ce
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mctct/processing_mctct.py
@@ -0,0 +1,139 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for M-CTC-T
+"""
+
+import warnings
+from contextlib import contextmanager
+
+from ....processing_utils import ProcessorMixin
+
+
+class MCTCTProcessor(ProcessorMixin):
+    r"""
+    Constructs a MCTCT processor which wraps a MCTCT feature extractor and a MCTCT tokenizer into a single processor.
+
+    [`MCTCTProcessor`] offers all the functionalities of [`MCTCTFeatureExtractor`] and [`AutoTokenizer`]. See the
+    [`~MCTCTProcessor.__call__`] and [`~MCTCTProcessor.decode`] for more information.
+
+    Args:
+        feature_extractor (`MCTCTFeatureExtractor`):
+            An instance of [`MCTCTFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`AutoTokenizer`):
+            An instance of [`AutoTokenizer`]. The tokenizer is a required input.
+    """
+
+    feature_extractor_class = "MCTCTFeatureExtractor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def __call__(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
+        [`~MCTCTFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to AutoTokenizer's
+        [`~AutoTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def pad(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to MCTCTFeatureExtractor's
+        [`~MCTCTFeatureExtractor.pad`] and returns its output. If used in the context
+        [`~MCTCTProcessor.as_target_processor`] this method forwards all its arguments to PreTrainedTokenizer's
+        [`~PreTrainedTokenizer.pad`]. Please refer to the docstring of the above two methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor.pad(*args, **kwargs)
+
+        input_features = kwargs.pop("input_features", None)
+        labels = kwargs.pop("labels", None)
+        if len(args) > 0:
+            input_features = args[0]
+            args = args[1:]
+
+        if input_features is not None:
+            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
+        if labels is not None:
+            labels = self.tokenizer.pad(labels, **kwargs)
+
+        if labels is None:
+            return input_features
+        elif input_features is None:
+            return labels
+        else:
+            input_features["labels"] = labels["input_ids"]
+            return input_features
+
+    def decode(self, *args, **kwargs):
+        """
+        This method forwards all its arguments to AutoTokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to the
+        docstring of this method for more information.
+        """
+        return self.tokenizer.decode(*args, **kwargs)
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning MCTCT.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        yield
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+
+__all__ = ["MCTCTProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cff2c19505f9306c28edb5bdabb66f668363f5fa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2023 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_mega import *
+    from .modeling_mega import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/configuration_mega.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/configuration_mega.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ede2cac66bf292c448b488d18af4d2e6e2c716a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/configuration_mega.py
@@ -0,0 +1,243 @@
+# coding=utf-8
+# Copyright 2023 The Mega Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MEGA configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from ....configuration_utils import PretrainedConfig
+from ....onnx import OnnxConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MegaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MegaModel`]. It is used to instantiate a Mega
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Mega
+    [mnaylor/mega-base-wikitext](https://huggingface.co/mnaylor/mega-base-wikitext) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the Mega model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`MegaModel`].
+        hidden_size (`int`, *optional*, defaults to 128):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 4):
+            Number of hidden layers in the Mega encoder.
+        intermediate_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the hidden size (self-attention value projection) within the Mega encoder
+        ema_projection_size (`int`, *optional*, defaults to 16):
+            Dimensionality of the MegaMultiDimensionDampedEma
+        bidirectional (`bool`, *optional*, defaults to `True`):
+            Whether the MegaMultiDimensionDampedEma used in Mega's self-attention should work bidirectionally (`True`)
+            or unidirectionally (`False`). Bidirectional EMA is incompatible with causal decoding, so this should be
+            False if you intend to use the model as a decoder.
+        shared_representation_size (`int`, *optional*, defaults to 64):
+            Dimensionality of the linear projection for shared representation of self-attention queries and keys
+        use_chunking (`bool`, *optional*, defaults to `False`):
+            Whether to chunk inputs for linear self-attention complexity (described as Mega-chunk in the paper)
+        chunk_size (`int`, *optional*, defaults to -1):
+            If `use_chunking` is set to `True`, determines the size of the chunks to apply to the input sequence. If
+            chunking is used, input sequences must be padded to a multiple of `chunk_size`
+        truncation (`int`, *optional*):
+            If specified, the sequence length for which to truncate MegaMultiDimensionDampedEma
+        normalize_before_mega (`bool`, *optional*, defaults to `True`):
+            Whether to normalize before (`True`) or after (`False`) passing through Mega encoder blocks
+        normalization_type (`str`, *optional*, defaults to `"scalenorm"`):
+            Type of normalization to use in Mega encoder blocks. Choose one of `"scalenorm"`, `"layernorm"`,
+            `"rmsnorm"`, `"batchnorm"`, or `"syncbatchnorm"` (GPU required for syncbatchnorm)
+        norm_affine (`bool`, *optional*, defaults to `True`):
+            If `True`, applies a parameterized affine transformation to inputs during normalization
+        activation (`str`, *optional*, defaults to `"silu"`):
+            Activation function to apply within Mega encoder blocks. Choose one of `"silu"`, `"relu"`, `"linear"`,
+            `"gelu"`, or `"gelu_accurate"`
+        attention_activation (`str`, *optional*, defaults to `"softmax"`):
+            Activation function to apply for single-headed self-attention (a la Transformer). Choose one of
+            `"softmax"`, `"laplace"`, or `"relu2"`
+        dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for EMA self-attention
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        use_feature_dropout (`bool`, *optional*, defaults to `False`):
+            Whether to use feature-based (`True`) or standard dropout (`False`)
+        use_normalized_ffn (`bool`, *optional*, defaults to `True`):
+            Whether to use the normalized feed-forward sub-layer in Mega blocks (`True`) or pass Mega encoder output
+            as-is (`False`)
+        nffn_hidden_size (`int`, *optional*, defaults to 256):
+            If using the normalized feed-forward network (NFFN) layer within Mega (`use_normalized_ffn = True`), this
+            is the hidden size of the NFFN
+        normalize_before_ffn (`bool`, *optional*, defaults to `True`):
+            Whether to normalize before (`True`) or after (`False`) the feed-forward portion of NFFN
+        nffn_activation_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the NFFN component.
+        max_positions (`int`, *optional*, defaults to 2048):
+            The maximum sequence length to use for positional representations. For `"simple"` relative positional bias,
+            this is a hard limit on input length; `"rotary"` relative positional bias will extrapolate to longer
+            sequences
+        add_token_type_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether to account for token types in embeddings. Left as optional to maintain compatibility with original
+            implementation while adding support for token types.
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`MegaModel`]. Only used if
+            `add_token_type_embeddings = True`
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        ema_delta_alpha_range (`float`, *optional*, defaults to 0.2):
+            The standard deviation for initializing the delta (damping factor) and alpha (decay factor) parameters in
+            MegaMultiDimensionDampedEma.
+        ema_beta_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation for initializing the beta parameter (expansion matrix) in
+            MegaMultiDimensionDampedEma.
+        ema_gamma_omega_range (`float`, *optional*, defaults to 1.0):
+            The standard deviation for initializing the gamma (projection matrix) and omega (residual weight)
+            parameters in MultiDimensionEMA.
+        relative_positional_bias (`str`, *optional*, defaults to `"rotary"`):
+            Type of relative positional encoding. Choose one of `"rotary"` or `"simple"`. If `"simple"` is selected,
+            `max_positions` is used as a limit on input size, while `"rotary"` extrapolates beyond `max_positions`.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+        add_lm_hidden_dense_layer (`bool`, *optional*, defaults to `True`):
+            Whether to include a hidden layer for projection between encoder outputs and LM heads (`True`) or pass
+            hidden states directly to LM head (`False`). Remains optional for compatibility with original
+            implementation
+
+    Examples:
+
+    ```python
+    >>> from transformers import MegaConfig, MegaModel
+
+    >>> # Initializing a Mega configuration
+    >>> configuration = MegaConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = MegaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mega"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=128,
+        num_hidden_layers=4,
+        intermediate_size=256,
+        ema_projection_size=16,
+        bidirectional=True,
+        shared_representation_size=64,
+        use_chunking=False,
+        chunk_size=-1,
+        truncation=None,
+        normalize_before_mega=True,
+        normalization_type="scalenorm",
+        norm_affine=True,
+        activation="silu",
+        attention_activation="softmax",
+        dropout_prob=0.1,
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        use_feature_dropout=False,
+        use_normalized_ffn=True,
+        nffn_hidden_size=256,
+        normalize_before_ffn=True,
+        nffn_activation_dropout_prob=0.1,
+        max_positions=2048,
+        add_token_type_embeddings=False,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        ema_delta_alpha_range=0.2,
+        ema_beta_range=0.02,
+        ema_gamma_omega_range=1.0,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        relative_positional_bias="rotary",
+        classifier_dropout=None,
+        use_cache=True,
+        add_lm_hidden_dense_layer=True,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.activation = activation
+        self.attention_activation = attention_activation
+        self.intermediate_size = intermediate_size
+        self.ema_projection_size = ema_projection_size
+        self.bidirectional = bidirectional
+        self.shared_representation_size = shared_representation_size
+        self.use_chunking = use_chunking
+        self.chunk_size = chunk_size
+        self.truncation = truncation
+        self.normalize_before_mega = normalize_before_mega
+        self.normalization_type = normalization_type
+        self.norm_affine = norm_affine
+        self.dropout_prob = dropout_prob
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.use_feature_dropout = use_feature_dropout
+        self.use_normalized_ffn = use_normalized_ffn
+        self.nffn_hidden_size = nffn_hidden_size
+        self.normalize_before_ffn = normalize_before_ffn
+        self.nffn_activation_dropout_prob = nffn_activation_dropout_prob
+        self.max_positions = max_positions
+        self.add_token_type_embeddings = add_token_type_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.ema_delta_alpha_range = ema_delta_alpha_range
+        self.ema_beta_range = ema_beta_range
+        self.ema_gamma_omega_range = ema_gamma_omega_range
+        self.relative_positional_bias = relative_positional_bias
+        self.use_cache = use_cache
+        self.classifier_dropout = classifier_dropout
+        self.add_lm_hidden_dense_layer = add_lm_hidden_dense_layer
+        self.num_attention_heads = 1  # not used but required by Hugging Face
+
+
+class MegaOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "multiple-choice":
+            dynamic_axis = {0: "batch", 1: "choice", 2: "sequence"}
+        else:
+            dynamic_axis = {0: "batch", 1: "sequence"}
+        return OrderedDict(
+            [
+                ("input_ids", dynamic_axis),
+                ("attention_mask", dynamic_axis),
+            ]
+        )
+
+
+__all__ = ["MegaConfig", "MegaOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/modeling_mega.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/modeling_mega.py
new file mode 100644
index 0000000000000000000000000000000000000000..c237afee9a334d57d155b8efa2eb5de8278d21dd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mega/modeling_mega.py
@@ -0,0 +1,2276 @@
+# coding=utf-8
+# Copyright 2023 The Mega Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch MEGA model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_outputs import (
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_mega import MegaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "mnaylor/mega-base-wikitext"
+_CONFIG_FOR_DOC = "MegaConfig"
+
+
+class MegaEmbeddings(nn.Module):
+    """
+    Mega's basic implementation does not incorporate token type embeddings, so this is a stripped-down version of
+    RoBERTa's embeddings which optionally includes token types
+    """
+
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.use_token_types = config.add_token_type_embeddings
+        if self.use_token_types:
+            self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+            # registering a buffer here allows model tracing when not passing optional token type IDs
+            # more info at transformers issue #5664
+            self.register_buffer(
+                "token_type_ids", torch.zeros(config.max_positions, dtype=torch.long).expand((1, -1)), persistent=False
+            )
+
+        self.padding_idx = config.pad_token_id
+
+    def forward(self, input_ids=None, token_type_ids=None, inputs_embeds=None):
+        if (input_ids is None) and (inputs_embeds is None):
+            raise ValueError("Must provide one of input_ids or inputs_embeds")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            device = input_ids.device
+
+            # get the word embeddings if only IDs are provided
+            inputs_embeds = self.word_embeddings(input_ids)
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+            device = inputs_embeds.device
+
+        # the original Mega implementation did not include token type embeddings, so we add
+        # an option to use them if desired; if embeddings are present and token type IDs are
+        # not provided, we will use a registered buffer (which helps with tracing)
+        if self.use_token_types:
+            if token_type_ids is None:
+                if hasattr(self, "token_type_ids"):
+                    buffered_token_type_ids = self.token_type_ids[:, : input_shape[1]]
+                    buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], input_shape[1])
+                    token_type_ids = buffered_token_type_ids_expanded
+                else:
+                    token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+            # access token type embeddings
+            token_type_embeddings = self.token_type_embeddings(token_type_ids)
+            # add the token type embeddings to the word embeddings
+            embeddings = inputs_embeds + token_type_embeddings
+        else:
+            embeddings = inputs_embeds
+        return embeddings
+
+
+class MegaSimpleRelativePositionalBias(nn.Module):
+    """
+    Simple relative positional embeddings copied from the Mega repo; renamed variables for better readability
+    """
+
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+        self.config = config
+        self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
+        self.rel_pos_bias = nn.Parameter(torch.Tensor(2 * config.max_positions - 1))
+
+    def forward(self, seq_len):
+        if seq_len > self.max_positions:
+            raise ValueError(f"Sequence length {seq_len} going beyond max length {self.max_positions}")
+
+        # seq_len * 2 - 1
+        bias = self.rel_pos_bias[(self.max_positions - seq_len) : (self.max_positions + seq_len - 1)]
+        # seq_len * 3 - 1
+        tile = F.pad(bias, (0, seq_len))
+        # (seq_len * 3 - 1) * seq_len
+        tile = torch.tile(tile, (seq_len,))
+        tile = tile[:-seq_len]
+        # seq_len x (3 * seq_len - 2)
+        tile = tile.view(seq_len, 3 * seq_len - 2)
+        start = (2 * seq_len - 1) // 2
+        end = tile.size(1) - start
+        tile = tile[:, start:end]
+        return tile
+
+
+class MegaRotaryRelativePositionalBias(nn.Module):
+    """
+    Rotary relative bias for positional information; similar in concept to RoPE (i.e. RoFormer) but taken from the Mega
+    repo due to differences in implementation.
+
+    When initialized, produces a positional bias which ranges from position 0 to config.max_positions, but can
+    extrapolate to longer sequences. Can be indexed according to input position IDs
+    """
+
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+        if config.hidden_size % 2 != 0:
+            raise RuntimeError("Rotary positional bias requires `hidden_size` to be a multiple of 2")
+        self.config = config
+        self.embed_dim = config.shared_representation_size
+        self.max_positions = self.config.max_positions if self.config.chunk_size < 0 else self.config.chunk_size
+        self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(
+            config.max_positions, self.embed_dim
+        )
+        # alpha and beta parameters for the rotary bias; beta renamed to b_param to avoid clashes with tf/flax weight handling
+        # in loading pretrained weights
+        self.alpha = nn.Parameter(torch.Tensor(1, self.embed_dim))
+        self.b_param = nn.Parameter(torch.Tensor(1, self.embed_dim))
+        self.register_buffer("_float_tensor", torch.FloatTensor([0.0]))
+
+    @staticmethod
+    def get_sinusoid_embeddings(max_positions: int, embedding_dim: int):
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / half_dim
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(max_positions, dtype=torch.float).unsqueeze(1) * emb.unsqueeze(0)
+        return torch.sin(emb), torch.cos(emb)
+
+    def rotary(self, input):
+        seq_len, embed_dim = input.size()
+        chunk_1, chunk_2 = torch.chunk(input, 2, dim=-1)
+        if self.sine is None or seq_len > self.sine.size(0):
+            self.sine, self.cosine = MegaRotaryRelativePositionalBias.get_sinusoid_embeddings(seq_len, embed_dim)
+            self.max_positions = seq_len
+        self.sine = self.sine.to(self._float_tensor)
+        self.cosine = self.cosine.to(self._float_tensor)
+
+        sin = self.sine[:seq_len]
+        cos = self.cosine[:seq_len]
+        return torch.cat([chunk_1 * cos - chunk_2 * sin, chunk_2 * cos + chunk_1 * sin], dim=1)
+
+    def forward(self, seq_len):
+        rotary_alpha = self.rotary(self.alpha.expand(seq_len, self.embed_dim))
+        rotary_beta = self.rotary(self.b_param.expand(seq_len, self.embed_dim))
+        bias = torch.einsum("mk,nk->mn", rotary_alpha, rotary_beta)
+        return bias
+
+
+class MegaDropout(nn.Module):
+    """
+    A unified class for standard dropout functionality and featurewise dropout.
+
+    The original fairseq Mega repo used 2 classes for these, which included some unnecessary handling of training logic
+    and an unused `inplace` option. The original implementation used torch.nn.functional instead of submodules, which
+    is retained here as well.
+    """
+
+    def __init__(self, dropout_probability, is_featurewise=False):
+        super().__init__()
+        self.dropout_probability = dropout_probability
+        self.is_featurewise = is_featurewise
+
+    def forward(self, input, batch_first: bool = False):
+        if self.is_featurewise:
+            if batch_first:
+                # (batch_size X sequence_length X feature_dimension)
+                # -> (batch_size X feature_dimension X sequence_length)
+                # -> (batch_size X sequence_length X feature_dimension)
+                return F.dropout2d(
+                    input.transpose(-1, -2), p=self.dropout_probability, training=self.training
+                ).transpose(-1, -2)
+            else:
+                if input.dim() != 3:
+                    raise ValueError(
+                        "Feature dropout inputs must be exactly 3-dimensional if inputs are ordered [sequence length, batch size, hidden dimension]"
+                    )
+                # (sequence_length X batch_size X feature_dimension)
+                # -> (batch_size X feature_dimension X sequence_length)
+                # -> (sequence_length X batch_size X feature_dimension)
+                return F.dropout2d(input.permute(1, 2, 0), p=self.dropout_probability, training=self.training).permute(
+                    2, 0, 1
+                )
+        else:
+            return F.dropout(input, p=self.dropout_probability, training=self.training)
+
+
+class MegaRMSNorm(nn.Module):
+    """
+    RMSNorm used in Mega implementation. Differs from T5's RMSNorm by applying the weight prior to taking the square
+    root (as opposed to after in T5)
+    """
+
+    def __init__(self, number_features, eps=1e-6, affine=True):
+        super().__init__()
+        self.num_features = number_features
+        self.eps = eps
+        self.affine = affine
+        if affine:
+            self.weight = nn.Parameter(torch.Tensor(self.num_features))
+        else:
+            self.register_parameter("weight", None)
+
+    def forward(self, input):
+        mean_square = torch.mean(torch.square(input), dim=-1, keepdim=True)
+        if self.weight is not None:
+            input = input * self.weight
+
+        input * torch.rsqrt(mean_square + self.eps)
+        return input
+
+    def extra_repr(self):
+        return f"{self.num_features}, eps={self.eps}, affine={self.affine}"
+
+
+class MegaScaleNorm(nn.Module):
+    """
+    Scale normalization introduced in MEGA which is similar to RMSNorm, but uses a single parameter for scalar
+    multiplication instead of a vector, and applies over a specified dimension
+    """
+
+    def __init__(self, dim, eps=1e-6, affine=True):
+        super().__init__()
+        self.dim = dim
+        self.eps = eps
+        self.affine = affine
+        if affine:
+            self.scalar = nn.Parameter(torch.Tensor(1))
+        else:
+            self.register_parameter("scalar", None)
+
+    def forward(self, input):
+        mean_square = torch.mean(torch.square(input), dim=self.dim, keepdim=True)
+        if self.scalar is not None:
+            input = self.scalar * input
+
+        output = input * torch.rsqrt(mean_square + self.eps)
+        return output
+
+
+class MegaSequenceNorm(nn.Module):
+    """
+    A wrapper class for various layer normalization options used in Mega. Used to handle differences in expectations on
+    input axis locations for different normalization methods.
+    """
+
+    def __init__(self, norm_type, embedding_dim, eps=1e-5, affine=True, export=False):
+        super().__init__()
+        if norm_type == "layernorm":
+            self.norm = nn.LayerNorm(embedding_dim, eps, elementwise_affine=affine)
+        elif norm_type == "scalenorm":
+            self.norm = MegaScaleNorm(dim=-1, eps=eps, affine=affine)
+        elif norm_type == "rmsnorm":
+            self.norm = MegaRMSNorm(embedding_dim, eps=eps, affine=affine)
+        elif norm_type == "batchnorm":
+            self.norm = nn.BatchNorm1d(embedding_dim, eps=eps, affine=affine)
+        elif norm_type == "syncbatchnorm":
+            self.norm = nn.SyncBatchNorm(embedding_dim, eps=eps, affine=affine)
+        else:
+            raise ValueError(f"Unknown norm type: {norm_type}")
+
+    def forward(self, input):
+        if isinstance(self.norm, nn.modules.batchnorm._BatchNorm):
+            if input.dim() != 3:
+                raise ValueError("BatchNorm inputs must be exactly 3-dimensional")
+            input = input.permute(1, 2, 0)
+            input = self.norm(input)
+            return input.permute(2, 0, 1)
+        else:
+            return self.norm(input)
+
+
+class MegaMultiDimensionDampedEma(nn.Module):
+    """
+    Mega's Exponential Moving Average layer, largely left unmodified from the original repo with the exception of
+    variable names and moving away from the stateful representation of incremental decoding state. See
+    "https://huggingface.co/papers/2209.10655" for more details.
+    """
+
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+
+        self.config = config
+
+        self.embed_dim = config.hidden_size
+        self.ndim = config.ema_projection_size
+        self.bidirectional = config.bidirectional
+        self.truncation = config.truncation
+        self.scale = math.sqrt(1.0 / self.ndim)
+
+        kernel_dim = 2 * config.hidden_size if self.bidirectional else config.hidden_size
+        # renamed delta (damping_factor) and alpha (decay_factor) to be more descriptive of what the parameters are doing
+        self.damping_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
+        self.decay_factor = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
+        # renamed gamma (kernel_projection_matrix) and beta (ema_expansion_matrix) respectively to avoid HF renaming
+        # things and align with the paper's description of these params' behavior
+        self.ema_expansion_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim, 1))
+        self.kernel_projection_matrix = nn.Parameter(torch.Tensor(kernel_dim, self.ndim))
+        # renamed omega to residual_weight to describe what it's doing
+        self.residual_weight = nn.Parameter(torch.Tensor(config.hidden_size))
+        self._kernel = None
+        self._coeffs = None
+
+    def _compute_ema_coefficients(self):
+        self._coeffs = None
+        # convert the alpha and delta parameters (kernel_dim x EMA projection size x 1) to [0, 1] with sigmoid
+        damping_factor = torch.sigmoid(self.damping_factor)
+        decay_factor = torch.sigmoid(self.decay_factor)
+        previous_timestep_weight = 1.0 - damping_factor * decay_factor
+        return damping_factor, previous_timestep_weight
+
+    def _compute_efficient_ema_kernel(self, length: int):
+        # computes the kernel used for efficient damped EMA applied via FFT convolution
+        self._kernel = None
+        # p and q have shape (kernel_dim x ema_projection_size x 1)
+        damping_factor, previous_timestep_weight = self._compute_ema_coefficients()
+        # extend the kernel to (kernel_dim X ema_projection_size X sequence_length) and
+        # multiply q by sequential ints up to the sequence length
+        vander = torch.arange(length).to(damping_factor).view(1, 1, length) * torch.log(previous_timestep_weight)
+        kernel = (damping_factor * self.ema_expansion_matrix) * torch.exp(vander)
+        # (kernel_dim X ema_projection_size X sequence_length) -> (kernel_dim, sequence_length)
+        return torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
+
+    def get_ema_coefficients(self):
+        if self.training:
+            return self._compute_ema_coefficients()
+        else:
+            if self._coeffs is None:
+                self._coeffs = self._compute_ema_coefficients()
+            return self._coeffs
+
+    def get_ema_kernel(self, length: int):
+        kernel_size = length if self.truncation is None else min(self.truncation, length)
+        if self.training:
+            return self._compute_efficient_ema_kernel(kernel_size)
+        else:
+            if self._kernel is None or self._kernel.size(-1) < kernel_size:
+                self._kernel = self._compute_efficient_ema_kernel(kernel_size)
+            return self._kernel[..., :kernel_size]
+
+    def fft_convolution(self, inputs, kernel, length):
+        # this is a wrapper for repeated use of EMA calculation via FFT (fast Fourier transform) convolution
+        inputs_fft = torch.fft.rfft(inputs.float(), n=2 * length)
+        kernel_fft = torch.fft.rfft(kernel.float(), n=2 * length)
+        convolved_sequence = torch.fft.irfft(inputs_fft * kernel_fft, n=2 * length)
+        return convolved_sequence
+
+    def ema_step(self, inputs, length, past_state=None):
+        if length == 1:
+            return self.one_ema_step(inputs, past_state=past_state)
+
+        # (kernel_dim X ema_projection_size X 1)
+        damping_factor, previous_timestep_weight = self.get_ema_coefficients()
+        # (kernel_dim X ema_projection_size X 1+sequence_length)
+        vander = torch.arange(length + 1).to(damping_factor).view(1, 1, length + 1) * torch.log(
+            previous_timestep_weight
+        )
+        vander = torch.exp(vander)
+        if past_state is not None:
+            # (kernel_dim X ema_projection_size X sequence_length) * (kernel_dim X ema_projection_size X 1)
+            # -> (kernel_dim X ema_projection_size X sequence_length)
+            past_ema_proj = vander[:, :, 1:] * (self.kernel_projection_matrix * self.scale).unsqueeze(-1)
+            # past_state will be (batch_size, kernel_dim, ema_projection_size)
+            past_ema_state = torch.einsum("bdn,dnl->bdl", past_state, past_ema_proj)
+            # (kernel_dim X ema_projection_size) * (batch_size X kernel_dim X ema_projection_size)
+            # -> (batch_size X kernel_dim X ema_projection_size)
+            past_vandermonde = vander[:, :, -1] * past_state
+        else:
+            past_ema_state = None
+            past_vandermonde = None
+
+        # (kernel_dim X ema_projection_size X sequence_length)
+        vander = vander[:, :, :-1]
+        kernel = (damping_factor * self.ema_expansion_matrix) * vander
+        kernel_proj = torch.einsum("dnl,dn->dl", kernel, self.kernel_projection_matrix * self.scale)
+
+        ema_output = self.fft_convolution(inputs, kernel_proj, length=length)[..., 0:length]
+        ema_output = ema_output.type_as(inputs)
+        if past_ema_state is not None:
+            ema_output = ema_output + past_ema_state
+
+        updated_hidden_state = torch.einsum("bdl,dnl->bdn", inputs, torch.flip(kernel, dims=[2]))
+        if past_vandermonde is not None:
+            updated_hidden_state = updated_hidden_state + past_vandermonde
+        # return a tuple:
+        # (sequence_length, batch_size, kernel_dim)
+        # (batch_size, kernel_dim, ema_projection_size)
+        return ema_output.permute(2, 0, 1), updated_hidden_state
+
+    def one_ema_step(self, inputs, past_state=None):
+        damping_factor, previous_timestep_weight = self.get_ema_coefficients()
+        # (kernel_dim X ema_projection_size) x (batch_size X kernel_dim X 1)
+        # -> (batch_size X kernel_dim X ema_projection_size)
+        updated_state = (damping_factor * self.ema_expansion_matrix).squeeze(-1) * inputs
+        if past_state is not None:
+            updated_state = updated_state + previous_timestep_weight.squeeze(-1) * past_state
+        # (batch_size X kernel_dim)
+        out = torch.einsum("bdn,dn->bd", updated_state, self.kernel_projection_matrix * self.scale)
+        # (1 X batch_size X kernel_dim), (batch_size X kernel_dim X ema_projection_size)
+        return out.unsqueeze(0), updated_state
+
+    def forward(
+        self,
+        inputs,
+        attention_mask: Optional[torch.Tensor] = None,
+        prev_state: Optional[torch.Tensor] = None,
+        use_cache: bool = False,
+    ) -> torch.Tensor:
+        """
+        Mega's exponential moving average (EMA) sub-layer applied prior to single-headed (traditional) self-attention
+
+        Args:
+            inputs (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
+                Hidden state / embedding input to update via EMA based on FFT convolution
+            attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indicates which inputs are to be ignored (mostly due to padding), where elements are either 1 for *not
+                masked* or 0 for *masked*
+            prev_state (`torch.Tensor` of shape `(batch_size, config.ndim)`, *optional*):
+                The hidden state returned from the previous timestep during incremental decoding.
+            use_cache (`bool`, default `False`):
+                Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
+                updated EMA hidden state for use in the next step
+
+        Returns:
+            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+            inputs:
+            - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
+              states updated by EMA, with same shapes as inputs
+            - **updated_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor of shape `(batch_size,
+              config.ndim)` -- The incremental EMA state for use in the next step of incremental decoding
+        """
+
+        seq_len, bsz, embed_dim = inputs.size()
+        if embed_dim != self.embed_dim:
+            raise ValueError(
+                f"Unexpected embedding dimension received: input is {embed_dim}, model expects {self.embed_dim}"
+            )
+
+        # sequence_length X batch_size X hidden_size
+        residual = inputs * self.residual_weight
+
+        # (sequence_length x batch_size x hidden_size) -> (batch_size x hidden_size x sequence_length)
+        inputs = inputs.permute(1, 2, 0)
+        # mask the input: output is a tensor with 0 in the masked positions
+        if attention_mask is not None:
+            inputs = inputs * (attention_mask.unsqueeze(1).type_as(inputs))
+
+        if self.bidirectional and use_cache:
+            raise RuntimeError("Bidirectional EMA does not support incremental state")
+
+        if use_cache:
+            out, updated_state = self.ema_step(inputs, seq_len, past_state=prev_state)
+
+            # (batch_size X hidden_size) -> (1 x batch_size x hidden_size)
+            out = F.silu(out + residual)
+
+            # if incremental decoding, return the new state along with the output
+            return out, updated_state
+        else:
+            # (hidden_size x sequence_length)
+            kernel = self.get_ema_kernel(seq_len)
+            fft_len = seq_len
+            s_index = 0
+            kernel_size = kernel.size(1)
+            if self.bidirectional:
+                # split the kernel for each direction of EMA
+                k1, k2 = torch.split(kernel, [self.embed_dim, self.embed_dim], dim=0)
+                # (hidden_size X 2*sequence_length - 1)
+                kernel = F.pad(k1, (kernel_size - 1, 0)) + F.pad(k2.flip(-1), (0, kernel_size - 1))
+                inputs = F.pad(inputs, (kernel_size - 1, 0))
+                fft_len = fft_len + kernel_size - 1
+                s_index = 2 * kernel_size - 2
+
+            ema_output = self.fft_convolution(inputs, kernel, length=fft_len)[..., s_index : s_index + seq_len]
+            ema_output = ema_output.type_as(inputs)
+            # (batch_size X hidden_size X sequence_length) -> (sequence_length X batch_size X hidden_size)
+            gated_ema_output = F.silu(ema_output.permute(2, 0, 1) + residual)
+
+            return gated_ema_output, None
+
+
+class MegaGatedCrossAttention(nn.Module):
+    """
+    Gated Structured State Attention for use in encoder-decoder model. See Mega paper for more details. Only
+    modifications from original implementation are variable names, removing the unnecessary `before_attn_fn` and
+    `static_kv` arguments, and the stateful representation of incremental decoder state.
+    """
+
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+
+        self.config = config
+        self.activation = ACT2FN[self.config.activation]
+        self.attention_activation = self.config.attention_activation
+        self.scaling = self.config.shared_representation_size**-0.5 if self.attention_activation == "softmax" else None
+
+        self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
+        self.hidden_dropout = MegaDropout(
+            self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
+        )
+        # Attention dropout is standard dropout
+        self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
+
+        self.prenorm = self.config.normalize_before_mega
+        self.norm = MegaSequenceNorm(
+            self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
+        )
+
+        self.k_proj = nn.Linear(self.config.hidden_size, self.config.shared_representation_size)
+        self.v_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
+        self.q_proj = nn.Linear(
+            self.config.hidden_size, 2 * self.config.hidden_size + self.config.shared_representation_size
+        )
+        self.h_proj = nn.Linear(self.config.hidden_size, self.config.hidden_size)
+
+        if self.config.relative_positional_bias == "simple":
+            self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
+        elif self.config.relative_positional_bias == "rotary":
+            self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
+        else:
+            raise ValueError(f"unknown relative position bias: {self.config.relative_positional_bias}")
+
+        self.softmax = nn.Softmax(dim=-1)
+
+    def element_attention(self, query, key, key_padding_mask, pidx):
+        bsz, src_len, _ = key.size()
+        tgt_len = query.size(1) if pidx is None else pidx + 1
+        if key_padding_mask is not None:
+            # (batch_size X source_sequence_length) --> (batch_size X 1 X 1)
+            lengths = key_padding_mask.sum(dim=-1).view(bsz, 1, 1)
+        else:
+            lengths = src_len
+
+        # (target_sequence_length X source_sequence_length)
+        bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
+        if pidx is not None:
+            if query.size(1) != 1:
+                raise ValueError("Position offset provided with queries longer than 1 token")
+            # source_sequence_length
+            bias = bias[pidx]
+        else:
+            # (target_sequence_length X source_sequence_length)
+            bias = bias[:tgt_len]
+
+        # (batch_size X target_sequence_length X source_sequence_length)
+        qk = torch.bmm(query, key.transpose(1, 2)) / lengths + bias
+
+        attn_weights = ACT2FN[self.attention_activation](qk).type_as(qk)
+
+        if key_padding_mask is not None:
+            attn_weights = attn_weights * key_padding_mask.unsqueeze(1)
+
+        return attn_weights
+
+    def softmax_attention(self, query, key, key_padding_mask, pidx):
+        bsz, src_len, _ = key.size()
+        tgt_len = query.size(1) if pidx is None else pidx + 1
+
+        # (target_sequence_length X source_sequence_length)
+        bias = self.rel_pos_bias(max(tgt_len, src_len))[:, :src_len]
+        if pidx is not None:
+            if query.size(1) != 1:
+                raise ValueError("Position offset provided with queries longer than 1 token")
+            # source_sequence_length
+            bias = bias[pidx]
+        else:
+            # (target_sequence_length X source_sequence_length)
+            bias = bias[:tgt_len]
+
+        # scaled attention
+        query = query * self.scaling
+        # (batch_size X target_sequence_length X source_sequence_length)
+        qk = torch.bmm(query, key.transpose(1, 2)) + bias
+
+        if key_padding_mask is not None:
+            qk = qk.masked_fill((1 - key_padding_mask).unsqueeze(1).to(torch.bool), float("-inf"))
+
+        attn_weights = self.softmax(qk).type_as(qk)
+        return attn_weights
+
+    def forward(
+        self,
+        query,
+        key: Optional[torch.Tensor],
+        value: Optional[torch.Tensor],
+        key_padding_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """
+        Gated cross-attention used in Mega
+
+        Args:
+            query (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
+                The self (or target) sequence input used as query inputs for cross-attention
+            key (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
+                The cross (or source) sequence input with shape used as keys in cross-attention
+            value (`torch.Tensor` of shape `(source_sequence_length, batch_size, hidden_size)`):
+                The cross (or source) sequence input with shape used as values in cross-attention
+            key_padding_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
+                Padding mask corresponding to the source sequence, where entries are 1 for *not masked* and 0 for
+                *masked* tokens
+            past_key_values (`tuple(torch.FloatTensor)`, *optional*):
+                If provided, the hidden state returned from the previous timestep during incremental decoding; expects
+                that prior cross-attention keys and values will be the last two items in the tuple
+            output_attentions (`bool`, defaults to `False`):
+                Whether or not to return the cross-attention weights.
+            use_cache (`bool`, defaults to `False`):
+                Whether to perform incremental decoding; uses `prev_state` as the prior timestep, and returns the
+                updated EMA hidden state for use in the next step
+
+        Returns:
+            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+            inputs:
+            - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
+              Hidden states from target sequence updated by gated cross-attention
+            - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
+              `(batch_size, source_sequence_length, target_sequence_length)` -- The pairwise cross-attention weights
+              corresponding to each token in the source and target sequences
+            - **cross_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+              source_sequence_length, config.shared_representation_size)` -- The cross-attention key state for use in
+              the next step of incremental decoding
+            - **cross_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+              source_sequence_length, config.hidden_size)` -- The cross-attention value state for use in the next step
+              of incremental decoding
+        """
+
+        seq_len, bsz, embed_dim = query.size()
+        if embed_dim != self.config.hidden_size:
+            raise ValueError(
+                f"Unexpected embedding dimension received: input is {embed_dim} but expected {self.config.hidden_size}"
+            )
+
+        if past_key_values is not None:
+            # make sure the inputs only have a sequence length of 1 if we're doing incremental decoding
+            if seq_len != 1:
+                raise ValueError(f"Incremental decoding requested with self-sequence length > 1: {seq_len}")
+            # expect past_key_values to have (self_key, self_value, self_ema, cross_key, cross_value)
+            prev_cross_key, prev_cross_value = past_key_values[-2:]
+            key = value = None
+
+            # use the self-attention cache to get the position id of the current step
+            prev_self_key = past_key_values[0]
+            num_incremental_steps = prev_self_key.size(1) + 1
+        else:
+            prev_cross_key = prev_cross_value = None
+            # we still need the position id if we're doing incremental decoding (past_key_values will be None for the first step)
+            num_incremental_steps = 0 if use_cache and (seq_len == 1) else None
+
+        full_query = query
+        if self.prenorm:
+            full_query = self.norm(full_query)
+
+        # (target_sequence_length X batch_size X 2*hidden_size + shared_representation_size)
+        query_projected = self.q_proj(full_query)
+        # split the query projections into separate components
+        # - residual_weight is passed through sigmoid and sent through elementwise multiplication to the gated/weighted targets prior to being added to the query directly
+        # - target_gate is a silu-gated tensor that is multiplied by the attention-weighted target below prior to residual connection
+        # - attention_query is the part that is passed to the attention function
+        residual_weight, target_gate, attention_query = torch.split(
+            query_projected,
+            [self.config.hidden_size, self.config.hidden_size, self.config.shared_representation_size],
+            dim=-1,
+        )
+
+        # (target_sequence_length X batch_size X hidden_size)
+        residual_weight = torch.sigmoid(residual_weight)
+        target_gate = F.silu(target_gate)
+
+        if key is None:
+            if value is not None:
+                raise ValueError("Key and value must be `None` simultaneously")
+            projected_key = projected_value = None
+        else:
+            # (source_sequence_length X batch_size X shared_representation_size)
+            projected_key = self.k_proj(key)
+            # (source_sequence_length X batch_size X hidden_size)
+            projected_value = self.activation(self.v_proj(key))
+
+        # (target_sequence_length X batch_size X shared_representation_size)
+        # -> (batch_size X target_sequence_length X shared_representation_size)
+        attention_query = attention_query.transpose(0, 1)
+        if projected_key is not None:
+            projected_key = projected_key.transpose(0, 1)
+        if projected_value is not None:
+            projected_value = projected_value.transpose(0, 1)
+
+        # if we're doing incremental decoding, k and v are None and need to be overwritten with past values
+        if past_key_values is not None:
+            projected_key = prev_cross_key
+            projected_value = prev_cross_value
+
+        # if we're returning the cache for later use, store these now for later return (can be done without having past_key_values provided)
+        if use_cache:
+            updated_cross_key = projected_key
+            updated_cross_value = projected_value
+
+        ctx_len = projected_key.size(1)
+        # This is part of a workaround to get around fork/join parallelism
+        # not supporting Optional types.
+        if key_padding_mask is not None and key_padding_mask.dim() == 0:
+            key_padding_mask = None
+
+        if key_padding_mask is not None:
+            if key_padding_mask.size(0) != bsz:
+                raise ValueError("Key padding mask does not align on the batch dimension")
+            if key_padding_mask.size(1) != ctx_len:
+                raise ValueError("Key padding mask does not align on the sequence length dimension")
+
+        if self.attention_activation == "softmax":
+            attn_weights = self.softmax_attention(
+                attention_query, projected_key, key_padding_mask, num_incremental_steps
+            )
+        else:
+            attn_weights = self.element_attention(
+                attention_query, projected_key, key_padding_mask, num_incremental_steps
+            )
+
+        projected_value = self.hidden_dropout(projected_value, batch_first=True)
+        kernel = self.attention_dropout(attn_weights)
+        # (batch_size X target_sequence_length X hidden_size)
+        # -> (target_sequence_length X batch_size X hidden_size)
+        weighted_targets = torch.bmm(kernel, projected_value).transpose(0, 1)
+        # (target_sequence_length X batch_size X hidden_size)
+        weighted_targets = self.activation(self.h_proj(weighted_targets * target_gate))
+        weighted_targets = self.dropout(weighted_targets)
+        out = torch.addcmul(query, residual_weight, weighted_targets - query)
+
+        if not self.prenorm:
+            out = self.norm(out)
+
+        outputs = (out, attn_weights) if output_attentions else (out,)
+        if use_cache:
+            outputs = outputs + (updated_cross_key, updated_cross_value)
+
+        return outputs
+
+
+class MegaMovingAverageGatedAttention(nn.Module):
+    """
+    Pure PyTorch implementation of Mega block; see https://huggingface.co/papers/2209.10655 and original fairseq implementation
+    at https://github.com/facebookresearch/mega (copyright Meta Research, licensed under MIT License)
+
+    Differences from original implementation include hidden state refactor and fixed inconsistency with additive /
+    multiplicative attention masks
+    """
+
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+        self.config = config
+        self.activation = ACT2FN[self.config.activation]
+        self.scaling = (
+            self.config.shared_representation_size**-0.5 if self.config.attention_activation == "softmax" else None
+        )
+        self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
+        self.hidden_dropout = MegaDropout(
+            self.config.hidden_dropout_prob, is_featurewise=self.config.use_feature_dropout
+        )
+        # attention dropout is standard dropout
+        self.attention_dropout = MegaDropout(self.config.attention_probs_dropout_prob, is_featurewise=False)
+
+        self.norm = MegaSequenceNorm(
+            self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
+        )
+        self.ema_gate = MegaMultiDimensionDampedEma(config)
+
+        self.v_proj = nn.Linear(self.config.hidden_size, self.config.intermediate_size)
+        self.mx_proj = nn.Linear(
+            self.config.hidden_size,
+            self.config.shared_representation_size + self.config.intermediate_size + 2 * self.config.hidden_size,
+        )
+        self.h_proj = nn.Linear(self.config.intermediate_size, self.config.hidden_size)
+
+        self.qk_weight = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
+        self.qk_bias = nn.Parameter(torch.Tensor(2, self.config.shared_representation_size))
+
+        if self.config.relative_positional_bias == "simple":
+            self.rel_pos_bias = MegaSimpleRelativePositionalBias(config)
+        elif self.config.relative_positional_bias == "rotary":
+            self.rel_pos_bias = MegaRotaryRelativePositionalBias(config)
+        else:
+            raise ValueError(f"Unknown relative positional bias: {self.config.relative_positional_bias}")
+
+        self.softmax = nn.Softmax(dim=-1)
+        self.attention_function = (
+            self.softmax_attention if self.config.attention_activation == "softmax" else self.element_attention
+        )
+
+    def element_attention(self, query, key, padding_mask, causal_mask):
+        """
+        Apply element-wise attention via relu^2 or laplace. Same as original implementation but with standardized
+        causal attention mask. Expects the Hugging Face standard attention mask paradigm: 1 for not masked, and 0 for
+        masked.
+        """
+        seq_len = key.size(2)
+        if padding_mask is not None:
+            # (batch_size X number of chunks X 1)
+            lengths = padding_mask.sum(-1, keepdim=True)
+            # (batch_size X number of chunks X 1 X 1)
+            lengths = lengths.clamp(min=1.0).unsqueeze(-1)
+        else:
+            lengths = seq_len
+
+        if causal_mask is not None:
+            lengths = causal_mask.sum(dim=-1, keepdim=True)
+
+        # (sequence_length X sequence_length)
+        bias = self.rel_pos_bias(seq_len)
+        if seq_len != query.size(2):
+            if query.size(2) != 1:
+                raise ValueError("Size mismatch between Q and K in element attention")
+            # (1 X sequence_length)
+            bias = bias[-1:]
+
+        # (batch_size X number of chunks X sequence_length X sequence_length)
+        qk = torch.matmul(query, key.transpose(2, 3)) / lengths + bias
+
+        attn_weights = ACT2FN[self.config.attention_activation](qk).type_as(qk)
+
+        if padding_mask is not None:
+            attn_weights = attn_weights * padding_mask.unsqueeze(2)
+
+        if causal_mask is not None:
+            attn_weights = attn_weights * causal_mask
+
+        return attn_weights
+
+    def softmax_attention(self, query, key, padding_mask, causal_mask):
+        "Standard softmax self-attention, as in the original Transformer paper"
+        seq_len = key.size(2)
+        # (sequence_length X sequence_length)
+        bias = self.rel_pos_bias(seq_len)
+        if seq_len != query.size(2):
+            if query.size(2) != 1:
+                raise ValueError("Size mismatch between Q and K in softmax attention")
+            # (1 X sequence_length)
+            bias = bias[-1:]
+
+        # scaled attention
+        query = query * self.scaling
+
+        # (batch_size x number of chunks x chunk_size x chunk_size) if chunking
+        # (batch_size x 1 x sequence_length x sequence_length) otherwise
+        qk = torch.matmul(query, key.transpose(2, 3)) + bias
+
+        # apply causal mask (presumed to be 1/0 for not masked / masked)
+        # additive, but convert to 0/-inf (which is not explicitly in the Mega source code)
+        if causal_mask is not None:
+            additive_causal_mask = torch.zeros_like(causal_mask, dtype=qk.dtype)
+            additive_causal_mask = additive_causal_mask.masked_fill((1 - causal_mask).bool(), float("-inf"))
+            qk = qk + additive_causal_mask
+
+        if padding_mask is not None:
+            # 1 for tokens which are *not masked*
+            # 0 for tokens which are *masked*
+            # replace masked tokens with -inf to make softmax ignore them
+            # need to invert the padding mask to match what mega original did
+            padding_mask = 1 - padding_mask
+            padding_mask_all = padding_mask.all(dim=-1, keepdim=True)
+            padding_mask = torch.logical_and(padding_mask, ~padding_mask_all)
+            qk = qk.masked_fill(padding_mask.unsqueeze(2).to(torch.bool), float("-inf"))
+
+        attn_weights = self.softmax(qk).type_as(qk)
+        return attn_weights
+
+    def forward(
+        self,
+        input,
+        padding_mask: Optional[torch.Tensor] = None,
+        causal_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions=False,
+        use_cache=False,
+    ):
+        """
+        Mega's self-attention block, which combines multi-headed EMA with traditional self-attention
+
+        Args:
+            input (`torch.Tensor` of shape `(sequence_length, batch_size, hidden_size)`):
+                Hidden states to be updated by Mega's self-attention
+            padding_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked*
+                or 0 for *masked*
+            causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
+                Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
+                masked* or 0 for *masked*
+            past_key_values (`tuple(torch.Tensor)`, *optional*):
+                The hidden states returned from the previous timestep during incremental decoding; expects that
+                self-attention key, value, and EMA states are the first 3 entries in the tuple
+            output_attentions (`bool`, default `False`):
+                Whether to return self-attention weights
+            use_cache (`bool`, default `False`):
+                Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
+                states for use in the next step
+
+        Returns:
+            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+            inputs:
+            - **hidden_states** (`torch.FloatTensor` of shape `(sequence_length, batch_size, hidden_size)`) -- Hidden
+              states from target sequence updated by Mega's self-attention
+            - **attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
+              `(batch_size, 1, sequence_length, sequence_length)` -- The self-attention weights corresponding to how
+              each token in the input sequence attends to every other token
+            - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+              sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
+              step of incremental decoding
+            - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+              sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
+              incremental decoding
+            - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
+              `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding.
+        """
+
+        seq_len, bsz, embed_dim = input.size()
+        if embed_dim != self.config.hidden_size:
+            raise ValueError(f"Input embedding dimension should be {self.config.hidden_size}; received {embed_dim}")
+
+        # store inputs for residual connection and handle pre-norm if requested
+        residual = input
+        if self.config.normalize_before_mega:
+            input = self.norm(input)
+
+        # (sequence_length X batch_size X hidden_size) -> (sequence_length X batch_size X intermediate_size)
+        value = self.activation(self.v_proj(input))
+
+        # unpack the incremental state if provided
+        # assumed to be (self K, self V, self EMA state, cross K, cross V)
+        # also assumes that incremental decoding is working one token at a time, so input sequence length must be 1
+        if self.config.is_decoder and (past_key_values is not None):
+            if seq_len > 1:
+                raise ValueError(f"Incremental decoding only supports self sequence length of 1; received {seq_len}")
+            # the first 3 items in the saved states will be these regardless of whether cross-attention is present
+            prev_self_key, prev_self_value, prev_ema_state = past_key_values[0:3]
+        else:
+            prev_self_key = prev_self_value = prev_ema_state = None
+
+        # ema output is (sequence_length x batch_size x hidden_size)
+        # updated_ema_state will be None if use_cache=False; otherwise (batch_size, config.ndim)
+        ema_out, updated_ema_state = self.ema_gate(
+            input, attention_mask=padding_mask, prev_state=prev_ema_state, use_cache=use_cache
+        )
+        ema_out = self.dropout(ema_out)
+
+        # (sequence_length X batch_size X hidden_size)
+        # -> (sequence_length X batch_size X 2*hidden_size + config.shared_representation_size + config.intermediate_size)
+        # - residual_weight -> sigmoid -> applied to residual connection in torch.addcmul
+        # - query_key_gates -> split into two components: query_key becomes query and key for attention input, gates becomes gating for self-attention output
+        # - intermediate_state -> added to weighted attention output, sent through activation, and has inputs subtracted during
+        #   torch.addcmul to create the final layer output
+        base = self.mx_proj(ema_out)
+        residual_weight, query_key_gates, intermediate_state = torch.split(
+            base,
+            [
+                self.config.hidden_size,
+                self.config.shared_representation_size + self.config.intermediate_size,
+                self.config.hidden_size,
+            ],
+            dim=-1,
+        )
+
+        # (sequence_length X batch_size X hidden_size)
+        residual_weight = torch.sigmoid(residual_weight)
+
+        # (sequence_length X batch_size X shared_representation_size + intermediate_size)
+        query_key_gates = F.silu(query_key_gates)
+
+        # split into two different tensors: one for Q/K usage and the other for gating self-attention
+        query_key, attention_gate = torch.split(
+            query_key_gates, [self.config.shared_representation_size, self.config.intermediate_size], dim=-1
+        )
+
+        # (sequence_length X batch_size X shared_representation_size)
+        # -> (sequence_length X batch_size X 1 X shared_representation_size)
+        # -> (sequence_length X batch_size X 2 X shared_representation_size)
+        query_key = query_key.unsqueeze(2) * self.qk_weight + self.qk_bias
+
+        # (sequence_length X batch_size X 2 X shared_representation_size)
+        # -> 2 tensors of (sequence_length X batch_size X shared_representation_size)
+        query, key = torch.unbind(query_key, dim=2)
+
+        # (sequence_length X batch_size X dimension)
+        # -> (batch_size X sequence_length X dimension)
+        # where `dimension` is either shared_representation_size (queries and keys) or intermediate_size (values)
+        query = query.transpose(0, 1)
+        key = key.transpose(0, 1)
+        value = value.transpose(0, 1)
+
+        if self.config.is_decoder:
+            # combine history and current to save updated state (if history is provided)
+            # when chunking is applied, the past states will be None at the end of the chunk, in
+            # which case, proceed as if no K/V history had been provided
+            # saved states are stored with shape (batch_size X sequence_length X dimension)
+            if prev_self_key is not None:
+                key = torch.cat([prev_self_key, key], dim=1)
+            if prev_self_value is not None:
+                value = torch.cat([prev_self_value, value], dim=1)
+
+            # if not chunking, store as-is
+            if not self.config.use_chunking:
+                updated_self_key = key
+                updated_self_value = value
+            else:
+                curr_len = key.size(1) % self.config.chunk_size
+                if curr_len == 0:
+                    # if we're chunking and have reached the end of a chunk, wipe out the saved state
+                    updated_self_key = None
+                    updated_self_value = None
+                else:
+                    updated_self_key = key
+                    updated_self_value = value
+
+        ctx_len = key.size(1)  # potentially differs from seq_len because of incremental decoding
+        if not self.config.use_chunking:
+            # if we're not chunking, treat the entire sequence as one long chunk
+            # (batch_size X sequence_length X dimension) -> (batch_size X 1 X sequence_length X dimension)
+            query = query.unsqueeze(1)
+            key = key.unsqueeze(1)
+            value = value.unsqueeze(1)
+            if padding_mask is not None:
+                # (batch_size X sequence_length) -> (batch_size X 1 X sequence_length)
+                padding_mask = padding_mask.unsqueeze(1)
+        else:
+            # otherwise, split the sequences in the batch into `n_chunks` chunks of size `chunk_size`
+            if seq_len < self.config.chunk_size:
+                query = query.unsqueeze(1)
+            else:
+                # (batch_size X sequence_length X dimension) -> (batch_size X n_chunks X chunk_size X dimension)
+                n_chunks = seq_len // self.config.chunk_size
+                query = query.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size)
+
+            if ctx_len < self.config.chunk_size:
+                key = key.unsqueeze(1)
+                value = value.unsqueeze(1)
+                if padding_mask is not None:
+                    padding_mask = padding_mask.unsqueeze(1)
+            else:
+                # (batch_size X sequence_length X dimension) -> (batch_size X n_chunks X chunk_size X dimension)
+                n_chunks = ctx_len // self.config.chunk_size
+                key = key.reshape(bsz, n_chunks, self.config.chunk_size, self.config.shared_representation_size)
+                value = value.reshape(bsz, n_chunks, self.config.chunk_size, self.config.intermediate_size)
+                if padding_mask is not None:
+                    padding_mask = padding_mask.view(bsz, n_chunks, self.config.chunk_size)
+
+        # this is in the original Mega implementation to work around fork/join parallelism not supporting optional types
+        if padding_mask is not None and padding_mask.dim() == 0:
+            padding_mask = None
+
+        attn_weights = self.attention_function(query, key, padding_mask=padding_mask, causal_mask=causal_mask)
+
+        value = self.hidden_dropout(value, batch_first=True)
+        kernel = self.attention_dropout(attn_weights)
+
+        # (batch_size x n_chunks x chunk_size x intermediate_size) -> (sequence_length X batch_size X intermediate_size)
+        weighted_self_output = (
+            torch.matmul(kernel, value).view(bsz, seq_len, self.config.intermediate_size).transpose(0, 1)
+        )
+
+        # (sequence_length X batch_size X intermediate_size) -> (sequence_length X batch_size X hidden_size)
+        weighted_self_output = self.activation(intermediate_state + self.h_proj(weighted_self_output * attention_gate))
+        weighted_self_output = self.dropout(weighted_self_output)
+        # (sequence_length X batch_size X hidden_size)
+        out = torch.addcmul(residual, residual_weight, weighted_self_output - residual)
+
+        if not self.config.normalize_before_mega:
+            out = self.norm(out)
+
+        return_values = (out, attn_weights) if output_attentions else (out,)
+
+        if self.config.is_decoder:
+            return_values = return_values + (updated_self_key, updated_self_value, updated_ema_state)
+
+        return return_values
+
+
+class MegaNormalizedFeedForwardNetwork(nn.Module):
+    """
+    Normalized feed-forward network used in Mega blocks. Left as-is from original Mega repo aside from retrieving args
+    from Hugging Face config
+    """
+
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+
+        self.config = config
+        self.hidden_dim = config.nffn_hidden_size
+        self.act_fn = config.activation
+        self.activation = ACT2FN[config.activation]
+
+        self.dropout = MegaDropout(self.config.dropout_prob, is_featurewise=self.config.use_feature_dropout)
+        self.hidden_dropout = MegaDropout(
+            self.config.nffn_activation_dropout_prob, is_featurewise=self.config.use_feature_dropout
+        )
+
+        self.prenorm = self.config.normalize_before_ffn
+        self.norm = MegaSequenceNorm(
+            self.config.normalization_type, self.config.hidden_size, affine=self.config.norm_affine
+        )
+
+        self.fc1 = nn.Linear(self.config.hidden_size, self.config.nffn_hidden_size)
+        self.fc2 = nn.Linear(self.config.nffn_hidden_size, self.config.hidden_size)
+
+    def forward(self, inputs):
+        residual = inputs
+
+        if self.prenorm:
+            inputs = self.norm(inputs)
+
+        hidden = self.activation(self.fc1(inputs))
+        hidden = self.hidden_dropout(hidden)
+        output = self.fc2(hidden)
+        output = self.dropout(output)
+        output = output + residual
+
+        if not self.prenorm:
+            output = self.norm(output)
+
+        return output
+
+
+class MegaBlock(nn.Module):
+    def __init__(self, config: MegaConfig):
+        super().__init__()
+        self.seq_len_dim = 1
+        self.mega_layer = MegaMovingAverageGatedAttention(config)
+        self.nffn = MegaNormalizedFeedForwardNetwork(config) if config.use_normalized_ffn else None
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.cross_attn = MegaGatedCrossAttention(config)
+        else:
+            self.cross_attn = None
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.LongTensor] = None,
+        causal_mask: Optional[torch.LongTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: bool = False,
+    ) -> tuple[torch.Tensor]:
+        """
+        A single Mega layer: either encoder or decoder, with optional cross-attention and optional normalized
+        feed-forward layer
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(target_sequence_length, batch_size, hidden_size)`):
+                Hidden states to be updated by the Mega block
+            attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+                Indicates which entries in the self/target sequence are to be ignored (mostly due to padding), where
+                elements are either 1 for *not masked* or 0 for *masked*. Causal attention is enforced internally.
+            causal_mask (`torch.LongTensor` of shape `(sequence_length, sequence_length)`, *optional*):
+                Indicates which inputs are to be ignored due to causal attention, where elements are either 1 for *not
+                masked* or 0 for *masked*
+            encoder_hidden_states (`torch.Tensor`, of shape `(source_sequence_length, batch_size, hidden_size)`, *optional*):
+                Encoder hidden states to be used for cross-attention (and required for encoder-decoder model setup)
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, source_sequence_length)`, *optional*):
+                Indicates which entries in the cross/source sequence are to be ignored (mostly due to padding), where
+                elements are either 1 for *not masked* or 0 for *masked*.
+            past_key_values (`tuple(torch.Tensor)`, *optional*):
+                The hidden states returned from the previous timestep during incremental decoding; expects that
+                self-attention key, value, and EMA states are the first 3 entries in the tuple, and (if doing
+                cross-attention) cross-attention key and value are the last 2 entries in the tuple
+            output_attentions (`bool`, default `False`):
+                Whether to return self-attention weights
+            use_cache (`bool`, default `False`):
+                Whether to perform incremental decoding; uses `past_key_values` as prior state, and returns the updated
+                states for use in the next step
+
+        Returns:
+            `tuple(torch.FloatTensor)` containing various elements depending on configuration ([`MegaConfig`]) and
+            inputs:
+            - **hidden_states** (`torch.FloatTensor` of shape `(target_sequence_length, batch_size, hidden_size)`) --
+              Hidden states from target sequence updated by Mega
+            - **self_attn_weights** (*optional*, returned when `output_attentions=True`) `torch.FloatTensor` of shape
+              `(batch_size, 1, target_sequence_length, target_sequence_length)` -- The self-attention weights
+              corresponding to how each token in the input sequence attends to every other token
+            - **cross_attn_weights** (*optional*, returned when `output_attentions=True` and
+              `config.add_cross_attention=True`) `torch.FloatTensor` of shape `(batch_size, source_sequence_length,
+              target_sequence_length)` -- Pairwise cross-attention weights between every entry in the source sequence
+              and target sequence
+            - **self_key** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+              sequence_length, config.shared_representation_size)` -- The self-attention key state for use in the next
+              step of incremental decoding
+            - **self_value** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape `(batch_size,
+              sequence_length, config.hidden_size)` -- The self-attention value state for use in the next step of
+              incremental decoding
+            - **self_ema_state** (*optional*, returned when `use_cache=True`) `torch.FloatTensor` of shape
+              `(batch_size, config.ndim)` The incremental EMA state for use in the next step of incremental decoding.
+            - **cross_key** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
+              `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.shared_representation_size)` --
+              The cross-attention key state for use in the next step of incremental decoding
+            - **cross_value** (*optional*, returned when `use_cache=True` and `config.is_decoder=True`)
+              `torch.FloatTensor` of shape `(batch_size, source_sequence_length, config.hidden_size)` -- The
+              cross-attention value state for use in the next step of incremental decoding
+        """
+
+        # incremental decoding in the MegaMultiDimensionDampedEma module requires that the attention mask has the same
+        # sequence length as the input tensor; if we're caching incremental states, we assume the input
+        # sequence length is 1 (Mega will break otherwise), so we take the padding mask for the final
+        # token in the input (mask is received as [batch X sequence length])
+        if use_cache and (past_key_values is not None) and (attention_mask is not None):
+            mega_padding_mask = attention_mask[:, -1].unsqueeze(-1)
+        else:
+            mega_padding_mask = attention_mask
+
+        mega_outputs = self.mega_layer(
+            input=hidden_states,
+            padding_mask=mega_padding_mask,
+            causal_mask=causal_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+
+        new_hidden_states = mega_outputs[0]
+        self_key, self_value, self_ema_state = mega_outputs[-3:] if use_cache else (None, None, None)
+        self_attention_weights = mega_outputs[1] if output_attentions else None
+
+        # optional cross attention
+        if self.cross_attn is not None:
+            if encoder_hidden_states is None:
+                raise ValueError("Requested cross-attention without providing encoder hidden states")
+
+            cross_attn_outputs = self.cross_attn(
+                query=new_hidden_states,
+                key=encoder_hidden_states,
+                value=encoder_hidden_states,
+                key_padding_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+            # update the hidden state from cross attention
+            new_hidden_states = cross_attn_outputs[0]
+            # store cross-attention k/v if caching
+            cross_key, cross_value = cross_attn_outputs[-2:] if use_cache else (None, None)
+            cross_attention_weights = cross_attn_outputs[1] if output_attentions else None
+
+        # optional NFFN follows cross attention
+        if self.nffn is not None:
+            new_hidden_states = self.nffn(new_hidden_states)
+
+        outs = (new_hidden_states,)
+        if output_attentions:
+            outs = outs + (self_attention_weights,)
+            if self.cross_attn is not None:
+                outs = outs + (cross_attention_weights,)
+
+        if use_cache:
+            new_key_values = (
+                self_key,
+                self_value,
+                self_ema_state,
+            )
+            if self.cross_attn is not None:
+                new_key_values = new_key_values + (cross_key, cross_value)
+
+            outs = outs + (new_key_values,)
+
+        return outs
+
+
+# copied from transformers.models.roberta.modeling_roberta.RobertaPooler with Roberta->Mega
+class MegaPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class MegaPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: MegaConfig
+    base_model_prefix = "mega"
+    supports_gradient_checkpointing = False
+    _no_split_modules = ["MegaMovingAverageGatedAttention"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, MegaMultiDimensionDampedEma):
+            with torch.no_grad():
+                # delta & alpha
+                nn.init.normal_(module.damping_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
+                nn.init.normal_(module.decay_factor, mean=0.0, std=self.config.ema_delta_alpha_range)
+                # beta [1, -1, 1, -1, ...] seems more stable.
+                val = torch.ones(self.config.ema_projection_size, 1)
+                if self.config.ema_projection_size > 1:
+                    idx = torch.tensor(list(range(1, self.config.ema_projection_size, 2)))
+                    val.index_fill_(0, idx, -1.0)
+                module.ema_expansion_matrix.normal_(mean=0.0, std=self.config.ema_beta_range).add_(val)
+                # gamma & omega
+                nn.init.normal_(module.kernel_projection_matrix, mean=0.0, std=self.config.ema_gamma_omega_range)
+                nn.init.normal_(module.residual_weight, mean=0.0, std=self.config.ema_gamma_omega_range)
+        elif isinstance(module, MegaSimpleRelativePositionalBias):
+            nn.init.normal_(module.rel_pos_bias, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, MegaRotaryRelativePositionalBias):
+            nn.init.normal_(module.alpha, mean=0.0, std=self.config.initializer_range)
+            nn.init.normal_(module.b_param, mean=0.0, std=self.config.initializer_range)
+        elif isinstance(module, MegaScaleNorm):
+            if self.config.norm_affine:
+                nn.init.constant_(module.scalar, 1.0)
+        elif isinstance(module, MegaRMSNorm):
+            if self.config.norm_affine:
+                nn.init.constant_(module.weight, 1.0)
+        elif isinstance(module, MegaMovingAverageGatedAttention):
+            # linear layers covered separately by the generic nn.Linear init below
+            nn.init.normal_(module.qk_weight, mean=0.0, std=self.config.initializer_range)
+            nn.init.constant_(module.qk_bias, 0.0)
+        elif isinstance(module, nn.Linear):
+            # initializes all linear layers in the entire network
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+MEGA_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`MegaConfig`]): Model configuration class with all the parameters of the
+            model. Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+MEGA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+            This parameter can only be used when the model is initialized with `add_token_type_embeddings` parameter
+            set to `True`. All the value in this tensor should be always < config.type_vocab_size.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare MEGA Model transformer outputting raw hidden-states without any specific head on top.",
+    MEGA_START_DOCSTRING,
+)
+class MegaModel(MegaPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added after self-attention, following the architecture described in *Mega: Moving Average
+    Equipped Gated Attention*_ by Xuezhe Ma, Chunting Zhou, Xiang Kong, Junxian He, Liangke Gui, Graham Neubig,
+    Jonathan May, and Luke Zettlemoyer
+
+    To behave as a decoder the model needs to be initialized with the `is_decoder` argument of the configuration set to
+    `True` and `bidirectional` set to `False`. To be used in a Seq2Seq model, the model needs to initialized with both
+    `is_decoder=True` and `bidirectional=False` argument as well as `add_cross_attention` set to `True`; an
+    `encoder_hidden_states` is then expected as an input to the forward pass.
+
+    .. _*Mega: Moving Average Equipped Gated Attention*: https://huggingface.co/papers/2209.10655
+
+    """
+
+    def __init__(self, config: MegaConfig, add_pooling_layer=True):
+        super().__init__(config)
+        self.config = config
+
+        self.embedding_layer = MegaEmbeddings(config)
+        self.layers = nn.ModuleList([MegaBlock(config) for _ in range(config.num_hidden_layers)])
+
+        self.pooler = MegaPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing (retained from RoBERTa code)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embedding_layer.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embedding_layer.word_embeddings = value
+
+    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            device = input_ids.device
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            device = inputs_embeds.device
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if self.config.use_chunking:
+            input_shape = torch.tensor([input_shape[0], self.config.chunk_size])
+
+        batch_size, sequence_length = input_shape
+
+        if self.config.use_chunking and (sequence_length > self.config.chunk_size):
+            if sequence_length % self.config.chunk_size != 0:
+                raise ValueError(
+                    f"config.use_chunking is activated; input sequence length must be shorter than or a multiple of config.chunk_size\nreceived sequence length of {sequence_length} with chunk size {self.config.chunk_size}"
+                )
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+            # Mega expects the causal mask to be a 2D square matrix of (from) x (to) over the input sequence length
+            # the HF utility function generates a 3D causal mask which includes batch size, so we'll create a dummy
+            # mask with the correct device and all ones
+            temp_mask_for_extension = torch.ones((1, sequence_length), dtype=torch.long, device=device)
+            causal_mask = self.create_extended_attention_mask_for_decoder(input_shape, temp_mask_for_extension)
+
+            # get rid of batch dimension in the generated mask; result is (sequence_length X sequence_length)
+            causal_mask = causal_mask.squeeze(0)
+        else:
+            use_cache = False
+            causal_mask = None
+
+        # if using cache, make sure we have a tuple of tuples which matches the length of our hidden layers
+        if (past_key_values is not None) and (len(past_key_values) != self.config.num_hidden_layers):
+            raise ValueError(
+                f"Received past key/value cache with size mismatch; expected {self.config.num_hidden_layers}, received {len(past_key_values)}"
+            )
+
+        # get embeddings (batch X sequence length X embed dim)
+        embedding_output = self.embedding_layer(
+            input_ids=input_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
+
+        # transpose for Mega --> (seq len X batch X embed dim)
+        hidden_states = embedding_output.transpose(0, 1)
+
+        # we expect encoder hidden states to also have batch first in line
+        # with typical Hugging Face behavior (which is also how we return them)
+        # Mega expects sequence length first, so do the same transpose here
+        if encoder_hidden_states is not None:
+            encoder_hidden_states = encoder_hidden_states.transpose(0, 1)
+
+        # pass through mega layers
+        all_hidden_states = (embedding_output,) if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        next_decoder_cache = () if use_cache else None
+        for i, mega_layer in enumerate(self.layers):
+            current_decoder_cache = past_key_values[i] if past_key_values is not None else None
+            mega_outputs = mega_layer(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                causal_mask=causal_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=current_decoder_cache,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+            hidden_states = mega_outputs[0]
+            if output_hidden_states:
+                # store layer-wise hidden states in the way that the user expects
+                # (seq len X batch X embed dim) --> (batch X seq len X embed dim)
+                all_hidden_states += (hidden_states.transpose(0, 1),)
+            if output_attentions:
+                self_attn_weights = mega_outputs[1]
+                all_self_attentions += (self_attn_weights,)
+                if self.config.add_cross_attention:
+                    cross_attn_weights = mega_outputs[2]
+                    all_cross_attentions += (cross_attn_weights,)
+            if use_cache:
+                updated_cache = mega_outputs[-1]
+                next_decoder_cache += (updated_cache,)
+
+        # transpose final hidden states
+        hidden_states = hidden_states.transpose(0, 1)
+
+        # optional pooling layer
+        pooled_output = self.pooler(hidden_states) if self.pooler is not None else None
+
+        if not return_dict:
+            return (hidden_states, pooled_output) + (
+                all_hidden_states,
+                next_decoder_cache,
+                all_self_attentions,
+                all_cross_attentions,
+            )
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            pooler_output=pooled_output,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """MEGA Model with a `language modeling` head on top for CLM fine-tuning.""", MEGA_START_DOCSTRING
+)
+class MegaForCausalLM(MegaPreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: MegaConfig):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning("If you want to use `MegaForCausalLM` as a standalone, add `is_decoder=True.`")
+
+        self.mega = MegaModel(config, add_pooling_layer=False)
+
+        if config.add_lm_hidden_dense_layer:
+            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+            self.hidden_activation = nn.Tanh()
+        else:
+            self.dense = None
+            self.hidden_activation = None
+
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, MegaForCausalLM, AutoConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("mnaylor/mega-base-wikitext")
+        >>> config = AutoConfig.from_pretrained("mnaylor/mega-base-wikitext")
+        >>> config.is_decoder = True
+        >>> config.bidirectional = False
+        >>> model = MegaForCausalLM.from_pretrained(
+        ...     "mnaylor/mega-base-wikitext", config=config, ignore_mismatched_sizes=True
+        ... )
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.mega(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        if self.dense is not None:
+            sequence_output = self.dense(sequence_output)
+            sequence_output = self.hidden_activation(sequence_output)
+
+        prediction_scores = self.lm_head(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        # cut decoder_input_ids if past is used
+        if past_key_values is not None:
+            input_ids = input_ids[:, -1:]
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
+
+    def _reorder_cache(self, past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
+@add_start_docstrings("""MEGA Model with a `language modeling` head on top.""", MEGA_START_DOCSTRING)
+class MegaForMaskedLM(MegaPreTrainedModel):
+    _tied_weights_keys = ["mlm_head.weight"]
+
+    def __init__(self, config: MegaConfig):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `MegaForMaskedLM`, set `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.mega = MegaModel(config, add_pooling_layer=False)
+        if config.add_lm_hidden_dense_layer:
+            self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+            self.hidden_activation = nn.Tanh()
+        else:
+            self.dense = None
+            self.hidden_activation = None
+        self.mlm_head = nn.Linear(config.hidden_size, config.vocab_size)
+        self.dropout = nn.Dropout(config.dropout_prob)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.mlm_head
+
+    def set_output_embeddings(self, new_embeddings):
+        self.mlm_head = new_embeddings
+
+    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+        mask="<mask>",
+        expected_output="' Paris'",
+        expected_loss=0.1,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        kwargs (`dict[str, any]`, optional, defaults to *{}*):
+            Used to hide legacy arguments that have been deprecated.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.mega(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        if self.dense is not None:
+            sequence_output = self.dense(sequence_output)
+            sequence_output = self.hidden_activation(sequence_output)
+        prediction_scores = self.mlm_head(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    MEGA Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    MEGA_START_DOCSTRING,
+)
+class MegaForSequenceClassification(MegaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.mega = MegaModel(config, add_pooling_layer=False)
+        self.classifier = MegaClassificationHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.mega(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0]
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    MEGA Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    MEGA_START_DOCSTRING,
+)
+class MegaForMultipleChoice(MegaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.mega = MegaModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        flat_input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        flat_token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        flat_attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        flat_inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.mega(
+            flat_input_ids,
+            token_type_ids=flat_token_type_ids,
+            attention_mask=flat_attention_mask,
+            inputs_embeds=flat_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    MEGA Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    MEGA_START_DOCSTRING,
+)
+class MegaForTokenClassification(MegaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.mega = MegaModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.mega(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+# copied from transformers.models.roberta.modeling_roberta.RobertaClassificationHead with Roberta->Mega
+class MegaClassificationHead(nn.Module):
+    """Head for sentence-level classification tasks."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.out_proj = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(self, features, **kwargs):
+        x = features[:, 0, :]  # take <s> token (equiv. to [CLS])
+        x = self.dropout(x)
+        x = self.dense(x)
+        x = torch.tanh(x)
+        x = self.dropout(x)
+        x = self.out_proj(x)
+        return x
+
+
+@add_start_docstrings(
+    """
+    MEGA Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    MEGA_START_DOCSTRING,
+)
+class MegaForQuestionAnswering(MegaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.mega = MegaModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(MEGA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.mega(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "MegaForCausalLM",
+    "MegaForMaskedLM",
+    "MegaForMultipleChoice",
+    "MegaForQuestionAnswering",
+    "MegaForSequenceClassification",
+    "MegaForTokenClassification",
+    "MegaModel",
+    "MegaPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..03b556e2eddf6d5f81f6f4a15346596dd5a32f85
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_mmbt import *
+    from .modeling_mmbt import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py
new file mode 100644
index 0000000000000000000000000000000000000000..1c58e4e6cdd0d0a8c87b9b94ca896c87970b5654
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/configuration_mmbt.py
@@ -0,0 +1,45 @@
+# coding=utf-8
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MMBT configuration"""
+
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MMBTConfig:
+    """
+    This is the configuration class to store the configuration of a [`MMBTModel`]. It is used to instantiate a MMBT
+    model according to the specified arguments, defining the model architecture.
+
+    Args:
+        config ([`PreTrainedConfig`]):
+            Config of the underlying Transformer models. Its values are copied over to use a single config.
+        num_labels (`int`, *optional*):
+            Size of final Linear layer for classification.
+        modal_hidden_size (`int`, *optional*, defaults to 2048):
+            Embedding dimension of the non-text modality encoder.
+    """
+
+    def __init__(self, config, num_labels=None, modal_hidden_size=2048):
+        self.__dict__ = config.__dict__
+        self.modal_hidden_size = modal_hidden_size
+        if num_labels:
+            self.num_labels = num_labels
+
+
+__all__ = ["MMBTConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py
new file mode 100644
index 0000000000000000000000000000000000000000..45ae577f7fced2d276f4f54c5bf859e27e08ebae
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/mmbt/modeling_mmbt.py
@@ -0,0 +1,410 @@
+# coding=utf-8
+# Copyright (c) Facebook, Inc. and its affiliates.
+# Copyright (c) HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch MMBT model."""
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss, MSELoss
+
+from ....modeling_outputs import BaseModelOutputWithPooling, SequenceClassifierOutput
+from ....modeling_utils import ModuleUtilsMixin
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "MMBTConfig"
+
+
+class ModalEmbeddings(nn.Module):
+    """Generic Modal Embeddings which takes in an encoder, and a transformer embedding."""
+
+    def __init__(self, config, encoder, embeddings):
+        super().__init__()
+        self.config = config
+        self.encoder = encoder
+        self.proj_embeddings = nn.Linear(config.modal_hidden_size, config.hidden_size)
+        self.position_embeddings = embeddings.position_embeddings
+        self.token_type_embeddings = embeddings.token_type_embeddings
+        self.word_embeddings = embeddings.word_embeddings
+        self.LayerNorm = embeddings.LayerNorm
+        self.dropout = nn.Dropout(p=config.hidden_dropout_prob)
+
+    def forward(self, input_modal, start_token=None, end_token=None, position_ids=None, token_type_ids=None):
+        token_embeddings = self.proj_embeddings(self.encoder(input_modal))
+        seq_length = token_embeddings.size(1)
+
+        if start_token is not None:
+            start_token_embeds = self.word_embeddings(start_token)
+            seq_length += 1
+            token_embeddings = torch.cat([start_token_embeds.unsqueeze(1), token_embeddings], dim=1)
+
+        if end_token is not None:
+            end_token_embeds = self.word_embeddings(end_token)
+            seq_length += 1
+            token_embeddings = torch.cat([token_embeddings, end_token_embeds.unsqueeze(1)], dim=1)
+
+        if position_ids is None:
+            position_ids = torch.arange(seq_length, dtype=torch.long, device=input_modal.device)
+            position_ids = position_ids.unsqueeze(0).expand(input_modal.size(0), seq_length)
+
+        if token_type_ids is None:
+            token_type_ids = torch.zeros(
+                (input_modal.size(0), seq_length), dtype=torch.long, device=input_modal.device
+            )
+
+        position_embeddings = self.position_embeddings(position_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+        embeddings = token_embeddings + position_embeddings + token_type_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+MMBT_START_DOCSTRING = r"""
+    MMBT model was proposed in [Supervised Multimodal Bitransformers for Classifying Images and
+    Text](https://github.com/facebookresearch/mmbt) by Douwe Kiela, Suvrat Bhooshan, Hamed Firooz, Davide Testuggine.
+    It's a supervised multimodal bitransformer model that fuses information from text and other image encoders, and
+    obtain state-of-the-art performance on various multimodal classification benchmark tasks.
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`MMBTConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration.
+        transformer (`nn.Module`): A text transformer that is used by MMBT.
+            It should have embeddings, encoder, and pooler attributes.
+        encoder (`nn.Module`): Encoder for the second modality.
+            It should take in a batch of modal inputs and return k, n dimension embeddings.
+"""
+
+MMBT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_modal (`torch.FloatTensor` of shape `(batch_size, ***)`):
+            The other modality data. It will be the shape that the encoder for that type expects. e.g. With an Image
+            Encoder, the shape would be (batch_size, channels, height, width)
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. It does not expect [CLS] token to be added as it's
+            appended to the end of other modality embeddings. Indices can be obtained using [`AutoTokenizer`]. See
+            [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        modal_start_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Optional start token to be added to Other Modality Embedding. [CLS] Most commonly used for classification
+            tasks.
+        modal_end_tokens (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Optional end token to be added to Other Modality Embedding. [SEP] Most commonly used.
+        attention_mask (*optional*) `torch.FloatTensor` of shape `(batch_size, sequence_length)`:
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, sequence_length)`:
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        modal_token_type_ids (*optional*) `torch.LongTensor` of shape `(batch_size, modal_sequence_length)`:
+            Segment token indices to indicate different portions of the non-text modality. The embeddings from these
+            tokens will be summed with the respective token embeddings for the non-text modality.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        modal_position_ids (`torch.LongTensor` of shape `(batch_size, modal_sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings for the non-text modality.
+            Selected in the range `[0, config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, embedding_dim)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare MMBT Model outputting raw hidden-states without any specific head on top.",
+    MMBT_START_DOCSTRING,
+)
+class MMBTModel(nn.Module, ModuleUtilsMixin):
+    def __init__(self, config, transformer, encoder):
+        super().__init__()
+        self.config = config
+        self.transformer = transformer
+        self.modal_encoder = ModalEmbeddings(config, encoder, transformer.embeddings)
+
+    @add_start_docstrings_to_model_forward(MMBT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutputWithPooling, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_modal,
+        input_ids=None,
+        modal_start_tokens=None,
+        modal_end_tokens=None,
+        attention_mask=None,
+        token_type_ids=None,
+        modal_token_type_ids=None,
+        position_ids=None,
+        modal_position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        # For example purposes. Not runnable.
+        transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
+        encoder = ImageEncoder(args)
+        mmbt = MMBTModel(config, transformer, encoder)
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_txt_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_txt_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        modal_embeddings = self.modal_encoder(
+            input_modal,
+            start_token=modal_start_tokens,
+            end_token=modal_end_tokens,
+            position_ids=modal_position_ids,
+            token_type_ids=modal_token_type_ids,
+        )
+
+        input_modal_shape = modal_embeddings.size()[:-1]
+
+        if token_type_ids is None:
+            token_type_ids = torch.ones(input_txt_shape, dtype=torch.long, device=device)
+
+        txt_embeddings = self.transformer.embeddings(
+            input_ids=input_ids, position_ids=position_ids, token_type_ids=token_type_ids, inputs_embeds=inputs_embeds
+        )
+
+        embedding_output = torch.cat([modal_embeddings, txt_embeddings], 1)
+
+        input_shape = embedding_output.size()[:-1]
+
+        if attention_mask is None:
+            attention_mask = torch.ones(input_shape, device=device)
+        else:
+            attention_mask = torch.cat(
+                [torch.ones(input_modal_shape, device=device, dtype=torch.long), attention_mask], dim=1
+            )
+        if encoder_attention_mask is None:
+            encoder_attention_mask = torch.ones(input_shape, device=device)
+        else:
+            encoder_attention_mask = torch.cat(
+                [torch.ones(input_modal_shape, device=device), encoder_attention_mask], dim=1
+            )
+
+        extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+        encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        encoder_outputs = self.transformer.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.transformer.pooler(sequence_output)
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+
+@add_start_docstrings(
+    """
+    MMBT Model with a sequence classification/regression head on top (a linear layer on top of the pooled output)
+    """,
+    MMBT_START_DOCSTRING,
+    MMBT_INPUTS_DOCSTRING,
+)
+class MMBTForClassification(nn.Module):
+    r"""
+    **labels**: (*optional*) `torch.LongTensor` of shape `(batch_size,)`:
+        Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+        config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+        `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+    Returns: *Tuple* comprising various elements depending on the configuration (config) and inputs: **loss**:
+    (*optional*, returned when `labels` is provided) `torch.FloatTensor` of shape `(1,)`: Classification (or
+    regression if config.num_labels==1) loss. **logits**:
+        `torch.FloatTensor` of shape `(batch_size, config.num_labels)` Classification (or regression if
+        config.num_labels==1) scores (before SoftMax).
+    **hidden_states**: (*optional*, returned when `output_hidden_states=True`) list of `torch.FloatTensor` (one for
+    the output of each layer + the output of the embeddings) of shape `(batch_size, sequence_length, hidden_size)`:
+    Hidden-states of the model at the output of each layer plus the initial embedding outputs. **attentions**:
+    (*optional*, returned when `output_attentions=True`) list of `torch.FloatTensor` (one for each layer) of shape
+    `(batch_size, num_heads, sequence_length, sequence_length)`: Attentions weights after the attention softmax, used
+    to compute the weighted average in the self-attention heads.
+
+    Examples:
+
+    ```python
+    # For example purposes. Not runnable.
+    transformer = BertModel.from_pretrained("google-bert/bert-base-uncased")
+    encoder = ImageEncoder(args)
+    model = MMBTForClassification(config, transformer, encoder)
+    outputs = model(input_modal, input_ids, labels=labels)
+    loss, logits = outputs[:2]
+    ```"""
+
+    def __init__(self, config, transformer, encoder):
+        super().__init__()
+        self.num_labels = config.num_labels
+
+        self.mmbt = MMBTModel(config, transformer, encoder)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+    def forward(
+        self,
+        input_modal,
+        input_ids=None,
+        modal_start_tokens=None,
+        modal_end_tokens=None,
+        attention_mask=None,
+        token_type_ids=None,
+        modal_token_type_ids=None,
+        position_ids=None,
+        modal_position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        labels=None,
+        return_dict=None,
+    ):
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.mmbt(
+            input_modal=input_modal,
+            input_ids=input_ids,
+            modal_start_tokens=modal_start_tokens,
+            modal_end_tokens=modal_end_tokens,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            modal_token_type_ids=modal_token_type_ids,
+            position_ids=position_ids,
+            modal_position_ids=modal_position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.num_labels == 1:
+                #  We are doing regression
+                loss_fct = MSELoss()
+                loss = loss_fct(logits.view(-1), labels.view(-1))
+            else:
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["MMBTForClassification", "MMBTModel", "ModalEmbeddings"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c5373969ce7831491b0d5fa5495078fb1d3f6e4e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_nat import *
+    from .modeling_nat import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/configuration_nat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/configuration_nat.py
new file mode 100644
index 0000000000000000000000000000000000000000..002eaa7f82b6927ab383523e9b9a6abc5c2433a9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/configuration_nat.py
@@ -0,0 +1,148 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Neighborhood Attention Transformer model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ....utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class NatConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`NatModel`]. It is used to instantiate a Nat model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Nat
+    [shi-labs/nat-mini-in1k-224](https://huggingface.co/shi-labs/nat-mini-in1k-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 64):
+            Dimensionality of patch embedding.
+        depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
+            Number of layers in each level of the encoder.
+        num_heads (`list[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
+            Number of attention heads in each layer of the Transformer encoder.
+        kernel_size (`int`, *optional*, defaults to 7):
+            Neighborhood Attention kernel size.
+        mlp_ratio (`float`, *optional*, defaults to 3.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not a learnable bias should be added to the queries, keys and values.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        layer_scale_init_value (`float`, *optional*, defaults to 0.0):
+            The initial value for the layer scale. Disabled if <=0.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+    Example:
+
+    ```python
+    >>> from transformers import NatConfig, NatModel
+
+    >>> # Initializing a Nat shi-labs/nat-mini-in1k-224 style configuration
+    >>> configuration = NatConfig()
+
+    >>> # Initializing a model (with random weights) from the shi-labs/nat-mini-in1k-224 style configuration
+    >>> model = NatModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "nat"
+
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=64,
+        depths=[3, 4, 6, 5],
+        num_heads=[2, 4, 8, 16],
+        kernel_size=7,
+        mlp_ratio=3.0,
+        qkv_bias=True,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        hidden_act="gelu",
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        layer_scale_init_value=0.0,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        self.depths = depths
+        self.num_layers = len(depths)
+        self.num_heads = num_heads
+        self.kernel_size = kernel_size
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Nat work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
+        self.layer_scale_init_value = layer_scale_init_value
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["NatConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/modeling_nat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/modeling_nat.py
new file mode 100644
index 0000000000000000000000000000000000000000..a619cdb11225779eb6ab499840658754be65c9ff
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nat/modeling_nat.py
@@ -0,0 +1,932 @@
+# coding=utf-8
+# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Neighborhood Attention Transformer model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_outputs import BackboneOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+    ModelOutput,
+    OptionalDependencyNotAvailable,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_natten_available,
+    logging,
+    replace_return_docstrings,
+    requires_backends,
+)
+from ....utils.backbone_utils import BackboneMixin
+from .configuration_nat import NatConfig
+
+
+if is_natten_available():
+    from natten.functional import natten2dav, natten2dqkrpb
+else:
+
+    def natten2dqkrpb(*args, **kwargs):
+        raise OptionalDependencyNotAvailable()
+
+    def natten2dav(*args, **kwargs):
+        raise OptionalDependencyNotAvailable()
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "NatConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "shi-labs/nat-mini-in1k-224"
+_EXPECTED_OUTPUT_SHAPE = [1, 7, 7, 512]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "shi-labs/nat-mini-in1k-224"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tiger cat"
+
+
+# drop_path and NatDropPath are from the timm library.
+
+
+@dataclass
+class NatEncoderOutput(ModelOutput):
+    """
+    Nat encoder's outputs, with potential hidden states and attentions.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, hidden_size, height, width)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+            include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class NatModelOutput(ModelOutput):
+    """
+    Nat model's outputs that also contains a pooling of the last hidden states.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+            Average pooling of the last layer hidden-state.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, hidden_size, height, width)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+            include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class NatImageClassifierOutput(ModelOutput):
+    """
+    Nat outputs for image classification.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Classification (or regression if config.num_labels==1) loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+            shape `(batch_size, hidden_size, height, width)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+            include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class NatEmbeddings(nn.Module):
+    """
+    Construct the patch and position embeddings.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.patch_embeddings = NatPatchEmbeddings(config)
+
+        self.norm = nn.LayerNorm(config.embed_dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor]:
+        embeddings = self.patch_embeddings(pixel_values)
+        embeddings = self.norm(embeddings)
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class NatPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        patch_size = config.patch_size
+        num_channels, hidden_size = config.num_channels, config.embed_dim
+        self.num_channels = num_channels
+
+        if patch_size == 4:
+            pass
+        else:
+            # TODO: Support arbitrary patch sizes.
+            raise ValueError("Dinat only supports patch size of 4 at the moment.")
+
+        self.projection = nn.Sequential(
+            nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+        )
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
+        _, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embeddings = self.projection(pixel_values)
+        embeddings = embeddings.permute(0, 2, 3, 1)
+
+        return embeddings
+
+
+class NatDownsampler(nn.Module):
+    """
+    Convolutional Downsampling Layer.
+
+    Args:
+        dim (`int`):
+            Number of input channels.
+        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer class.
+    """
+
+    def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+        self.norm = norm_layer(2 * dim)
+
+    def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
+        input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
+        input_feature = self.norm(input_feature)
+        return input_feature
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+class NatDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class NeighborhoodAttention(nn.Module):
+    def __init__(self, config, dim, num_heads, kernel_size):
+        super().__init__()
+        if dim % num_heads != 0:
+            raise ValueError(
+                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+            )
+
+        self.num_attention_heads = num_heads
+        self.attention_head_size = int(dim / num_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.kernel_size = kernel_size
+
+        # rpb is learnable relative positional biases; same concept is used Swin.
+        self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
+
+        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 3, 1, 2, 4)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        query_layer = self.transpose_for_scores(self.query(hidden_states))
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        # Apply the scale factor before computing attention weights. It's usually more efficient because
+        # attention weights are typically a bigger tensor compared to query.
+        # It gives identical results because scalars are commutable in matrix multiplication.
+        query_layer = query_layer / math.sqrt(self.attention_head_size)
+
+        # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, 1)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, 1)
+        context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class NeighborhoodAttentionOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class NeighborhoodAttentionModule(nn.Module):
+    def __init__(self, config, dim, num_heads, kernel_size):
+        super().__init__()
+        self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size)
+        self.output = NeighborhoodAttentionOutput(config, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(hidden_states, output_attentions)
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class NatIntermediate(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class NatOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class NatLayer(nn.Module):
+    def __init__(self, config, dim, num_heads, drop_path_rate=0.0):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.kernel_size = config.kernel_size
+        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.attention = NeighborhoodAttentionModule(config, dim, num_heads, kernel_size=self.kernel_size)
+        self.drop_path = NatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.intermediate = NatIntermediate(config, dim)
+        self.output = NatOutput(config, dim)
+        self.layer_scale_parameters = (
+            nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
+            if config.layer_scale_init_value > 0
+            else None
+        )
+
+    def maybe_pad(self, hidden_states, height, width):
+        window_size = self.kernel_size
+        pad_values = (0, 0, 0, 0, 0, 0)
+        if height < window_size or width < window_size:
+            pad_l = pad_t = 0
+            pad_r = max(0, window_size - width)
+            pad_b = max(0, window_size - height)
+            pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
+            hidden_states = nn.functional.pad(hidden_states, pad_values)
+        return hidden_states, pad_values
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, height, width, channels = hidden_states.size()
+        shortcut = hidden_states
+
+        hidden_states = self.layernorm_before(hidden_states)
+        # pad hidden_states if they are smaller than kernel size
+        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
+
+        _, height_pad, width_pad, _ = hidden_states.shape
+
+        attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
+
+        attention_output = attention_outputs[0]
+
+        was_padded = pad_values[3] > 0 or pad_values[5] > 0
+        if was_padded:
+            attention_output = attention_output[:, :height, :width, :].contiguous()
+
+        if self.layer_scale_parameters is not None:
+            attention_output = self.layer_scale_parameters[0] * attention_output
+
+        hidden_states = shortcut + self.drop_path(attention_output)
+
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.output(self.intermediate(layer_output))
+
+        if self.layer_scale_parameters is not None:
+            layer_output = self.layer_scale_parameters[1] * layer_output
+
+        layer_output = hidden_states + self.drop_path(layer_output)
+
+        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+        return layer_outputs
+
+
+class NatStage(nn.Module):
+    def __init__(self, config, dim, depth, num_heads, drop_path_rate, downsample):
+        super().__init__()
+        self.config = config
+        self.dim = dim
+        self.layers = nn.ModuleList(
+            [
+                NatLayer(
+                    config=config,
+                    dim=dim,
+                    num_heads=num_heads,
+                    drop_path_rate=drop_path_rate[i],
+                )
+                for i in range(depth)
+            ]
+        )
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
+        else:
+            self.downsample = None
+
+        self.pointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        _, height, width, _ = hidden_states.size()
+        for i, layer_module in enumerate(self.layers):
+            layer_outputs = layer_module(hidden_states, output_attentions)
+            hidden_states = layer_outputs[0]
+
+        hidden_states_before_downsampling = hidden_states
+        if self.downsample is not None:
+            hidden_states = self.downsample(hidden_states_before_downsampling)
+
+        stage_outputs = (hidden_states, hidden_states_before_downsampling)
+
+        if output_attentions:
+            stage_outputs += layer_outputs[1:]
+        return stage_outputs
+
+
+class NatEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_levels = len(config.depths)
+        self.config = config
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
+        self.levels = nn.ModuleList(
+            [
+                NatStage(
+                    config=config,
+                    dim=int(config.embed_dim * 2**i_layer),
+                    depth=config.depths[i_layer],
+                    num_heads=config.num_heads[i_layer],
+                    drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+                    downsample=NatDownsampler if (i_layer < self.num_levels - 1) else None,
+                )
+                for i_layer in range(self.num_levels)
+            ]
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        output_hidden_states_before_downsampling: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple, NatEncoderOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if output_hidden_states:
+            # rearrange b h w c -> b c h w
+            reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
+            all_hidden_states += (hidden_states,)
+            all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+        for i, layer_module in enumerate(self.levels):
+            layer_outputs = layer_module(hidden_states, output_attentions)
+
+            hidden_states = layer_outputs[0]
+            hidden_states_before_downsampling = layer_outputs[1]
+
+            if output_hidden_states and output_hidden_states_before_downsampling:
+                # rearrange b h w c -> b c h w
+                reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states_before_downsampling,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+            elif output_hidden_states and not output_hidden_states_before_downsampling:
+                # rearrange b h w c -> b c h w
+                reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+            if output_attentions:
+                all_self_attentions += layer_outputs[2:]
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+
+        return NatEncoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            reshaped_hidden_states=all_reshaped_hidden_states,
+        )
+
+
+class NatPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: NatConfig
+    base_model_prefix = "nat"
+    main_input_name = "pixel_values"
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+NAT_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`NatConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+NAT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See [`ViTImageProcessor.__call__`]
+            for details.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Nat Model transformer outputting raw hidden-states without any specific head on top.",
+    NAT_START_DOCSTRING,
+)
+class NatModel(NatPreTrainedModel):
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+
+        requires_backends(self, ["natten"])
+
+        self.config = config
+        self.num_levels = len(config.depths)
+        self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
+
+        self.embeddings = NatEmbeddings(config)
+        self.encoder = NatEncoder(config)
+
+        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
+        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=NatModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, NatModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+
+        pooled_output = None
+        if self.pooler is not None:
+            pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
+            pooled_output = torch.flatten(pooled_output, 1)
+
+        if not return_dict:
+            output = (sequence_output, pooled_output) + encoder_outputs[1:]
+
+            return output
+
+        return NatModelOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """
+    Nat Model transformer with an image classification head on top (a linear layer on top of the final hidden state of
+    the [CLS] token) e.g. for ImageNet.
+    """,
+    NAT_START_DOCSTRING,
+)
+class NatForImageClassification(NatPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        requires_backends(self, ["natten"])
+
+        self.num_labels = config.num_labels
+        self.nat = NatModel(config)
+
+        # Classifier head
+        self.classifier = (
+            nn.Linear(self.nat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=NatImageClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, NatImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.nat(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return NatImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            reshaped_hidden_states=outputs.reshaped_hidden_states,
+        )
+
+
+@add_start_docstrings(
+    "NAT backbone, to be used with frameworks like DETR and MaskFormer.",
+    NAT_START_DOCSTRING,
+)
+class NatBackbone(NatPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        requires_backends(self, ["natten"])
+
+        self.embeddings = NatEmbeddings(config)
+        self.encoder = NatEncoder(config)
+        self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
+
+        # Add layer norms to hidden states of out_features
+        hidden_states_norms = {}
+        for stage, num_channels in zip(self.out_features, self.channels):
+            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
+        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @add_start_docstrings_to_model_forward(NAT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BackboneOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 512, 7, 7]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        embedding_output = self.embeddings(pixel_values)
+
+        outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            output_hidden_states_before_downsampling=True,
+            return_dict=True,
+        )
+
+        hidden_states = outputs.reshaped_hidden_states
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                # TODO can we simplify this?
+                batch_size, num_channels, height, width = hidden_state.shape
+                hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
+                hidden_state = hidden_state.view(batch_size, height * width, num_channels)
+                hidden_state = self.hidden_states_norms[stage](hidden_state)
+                hidden_state = hidden_state.view(batch_size, height, width, num_channels)
+                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps += (hidden_state,)
+
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs.hidden_states,)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["NatForImageClassification", "NatModel", "NatPreTrainedModel", "NatBackbone"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f0690129ae9edf84829278790b5e065bbd6608ee
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_nezha import *
+    from .modeling_nezha import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py
new file mode 100644
index 0000000000000000000000000000000000000000..00d193cd1ae68655e1631b7caea2220987a29f0b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/configuration_nezha.py
@@ -0,0 +1,105 @@
+from .... import PretrainedConfig
+
+
+class NezhaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`NezhaModel`]. It is used to instantiate an Nezha
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Nezha
+    [sijunhe/nezha-cn-base](https://huggingface.co/sijunhe/nezha-cn-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, optional, defaults to 21128):
+            Vocabulary size of the NEZHA model. Defines the different tokens that can be represented by the
+            *inputs_ids* passed to the forward method of [`NezhaModel`].
+        hidden_size (`int`, optional, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, optional, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, optional, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, optional, defaults to 3072):
+            The dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, optional, defaults to "gelu"):
+            The non-linear activation function (function or string) in the encoder and pooler.
+        hidden_dropout_prob (`float`, optional, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, optional, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, optional, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, optional, defaults to 2):
+            The vocabulary size of the *token_type_ids* passed into [`NezhaModel`].
+        initializer_range (`float`, optional, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, optional, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        classifier_dropout (`float`, optional, defaults to 0.1):
+            The dropout ratio for attached classifiers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+
+    Example:
+
+    ```python
+    >>> from transformers import NezhaConfig, NezhaModel
+
+    >>> # Initializing an Nezha configuration
+    >>> configuration = NezhaConfig()
+
+    >>> # Initializing a model (with random weights) from the Nezha-base style configuration model
+    >>> model = NezhaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "nezha"
+
+    def __init__(
+        self,
+        vocab_size=21128,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        max_relative_position=64,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        classifier_dropout=0.1,
+        pad_token_id=0,
+        bos_token_id=2,
+        eos_token_id=3,
+        use_cache=True,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.max_relative_position = max_relative_position
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.classifier_dropout = classifier_dropout
+        self.use_cache = use_cache
+
+
+__all__ = ["NezhaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddfecac9f50651ad16ff53814b262435e7edcc3e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/nezha/modeling_nezha.py
@@ -0,0 +1,1689 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Nezha model."""
+
+import math
+import os
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_nezha import NezhaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "sijunhe/nezha-cn-base"
+_CONFIG_FOR_DOC = "NezhaConfig"
+
+
+def load_tf_weights_in_nezha(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                scope_names = re.split(r"_(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "squad":
+                pointer = getattr(pointer, "classifier")
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
+            array = np.transpose(array)
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+class NezhaRelativePositionsEncoding(nn.Module):
+    """Implement the Functional Relative Position Encoding"""
+
+    def __init__(self, length, depth, max_relative_position=127):
+        super().__init__()
+        vocab_size = max_relative_position * 2 + 1
+        range_vec = torch.arange(length)
+        range_mat = range_vec.repeat(length).view(length, length)
+        distance_mat = range_mat - torch.t(range_mat)
+        distance_mat_clipped = torch.clamp(distance_mat, -max_relative_position, max_relative_position)
+        final_mat = distance_mat_clipped + max_relative_position
+
+        embeddings_table = torch.zeros(vocab_size, depth)
+        position = torch.arange(0, vocab_size, dtype=torch.int64).float().unsqueeze(1)
+        div_term = torch.exp(torch.arange(0, depth, 2).float() * (-math.log(10000.0) / depth))
+        embeddings_table[:, 0::2] = torch.sin(position * div_term)
+        embeddings_table[:, 1::2] = torch.cos(position * div_term)
+
+        flat_relative_positions_matrix = final_mat.view(-1)
+        one_hot_relative_positions_matrix = torch.nn.functional.one_hot(
+            flat_relative_positions_matrix, num_classes=vocab_size
+        ).float()
+        positions_encoding = torch.matmul(one_hot_relative_positions_matrix, embeddings_table)
+        my_shape = list(final_mat.size())
+        my_shape.append(depth)
+        positions_encoding = positions_encoding.view(my_shape)
+        self.register_buffer("positions_encoding", positions_encoding, persistent=False)
+
+    def forward(self, length):
+        return self.positions_encoding[:length, :length, :]
+
+
+class NezhaEmbeddings(nn.Module):
+    """Construct the embeddings from word and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.register_buffer(
+            "token_type_ids", torch.zeros((1, config.max_position_embeddings), dtype=torch.long), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=inputs_embeds.device)
+
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class NezhaSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.relative_positions_encoding = NezhaRelativePositionsEncoding(
+            length=config.max_position_embeddings,
+            depth=self.attention_head_size,
+            max_relative_position=config.max_relative_position,
+        )
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_values is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_values[0]
+            value_layer = past_key_values[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_values is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_values` is always `None`
+            past_key_values = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        batch_size, num_attention_heads, from_seq_length, to_seq_length = attention_scores.size()
+        relations_keys = self.relative_positions_encoding(to_seq_length)
+        query_layer_t = query_layer.permute(2, 0, 1, 3)
+
+        query_layer_r = query_layer_t.contiguous().view(
+            from_seq_length, batch_size * num_attention_heads, self.attention_head_size
+        )
+        key_position_scores = torch.matmul(query_layer_r, relations_keys.permute(0, 2, 1))
+        key_position_scores_r = key_position_scores.view(
+            from_seq_length, batch_size, num_attention_heads, from_seq_length
+        )
+        key_position_scores_r_t = key_position_scores_r.permute(1, 2, 0, 3)
+        attention_scores = attention_scores + key_position_scores_r_t
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in NezhaModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+        relations_values = self.relative_positions_encoding(to_seq_length)
+        attention_probs_t = attention_probs.permute(2, 0, 1, 3)
+        attentions_probs_r = attention_probs_t.contiguous().view(
+            from_seq_length, batch_size * num_attention_heads, to_seq_length
+        )
+        value_position_scores = torch.matmul(attentions_probs_r, relations_values)
+        value_position_scores_r = value_position_scores.view(
+            from_seq_length, batch_size, num_attention_heads, self.attention_head_size
+        )
+        value_position_scores_r_t = value_position_scores_r.permute(1, 2, 0, 3)
+        context_layer = context_layer + value_position_scores_r_t
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_values,)
+        return outputs
+
+
+class NezhaSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class NezhaAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.self = NezhaSelfAttention(config)
+        self.output = NezhaSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_values,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class NezhaIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class NezhaOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class NezhaLayer(GradientCheckpointingLayer):
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = NezhaAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = NezhaAttention(config)
+        self.intermediate = NezhaIntermediate(config)
+        self.output = NezhaOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_values=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple
+            cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class NezhaEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([NezhaLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values[i] if past_key_values is not None else None,
+                output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class NezhaPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class NezhaPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class NezhaLMPredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.transform = NezhaPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+class NezhaOnlyMLMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = NezhaLMPredictionHead(config)
+
+    def forward(self, sequence_output: torch.Tensor) -> torch.Tensor:
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+class NezhaOnlyNSPHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+
+
+class NezhaPreTrainingHeads(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = NezhaLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+class NezhaPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: NezhaConfig
+    load_tf_weights = load_tf_weights_in_nezha
+    base_model_prefix = "nezha"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@dataclass
+class NezhaForPreTrainingOutput(ModelOutput):
+    """
+    Output type of [`NezhaForPreTraining`].
+
+    Args:
+        loss (*optional*, returned when `labels` is provided, `torch.FloatTensor` of shape `(1,)`):
+            Total loss as the sum of the masked language modeling loss and the next sequence prediction
+            (classification) loss.
+        prediction_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        seq_relationship_logits (`torch.FloatTensor` of shape `(batch_size, 2)`):
+            Prediction scores of the next sequence prediction (classification) head (scores of True/False continuation
+            before SoftMax).
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    prediction_logits: Optional[torch.FloatTensor] = None
+    seq_relationship_logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+NEZHA_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`NezhaConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+NEZHA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Nezha Model transformer outputting raw hidden-states without any specific head on top.",
+    NEZHA_START_DOCSTRING,
+)
+class NezhaModel(NezhaPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = NezhaEmbeddings(config)
+        self.encoder = NezhaEncoder(config)
+
+        self.pooler = NezhaPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Nezha Model with two heads on top as done during the pretraining: a `masked language modeling` head and a `next
+    sentence prediction (classification)` head.
+    """,
+    NEZHA_START_DOCSTRING,
+)
+class NezhaForPreTraining(NezhaPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.nezha = NezhaModel(config)
+        self.cls = NezhaPreTrainingHeads(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=NezhaForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        next_sentence_label: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], NezhaForPreTrainingOutput]:
+        r"""
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+                config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked),
+                the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+            next_sentence_label (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+                Labels for computing the next sequence prediction (classification) loss. Input should be a sequence
+                pair (see `input_ids` docstring) Indices should be in `[0, 1]`:
+
+                - 0 indicates sequence B is a continuation of sequence A,
+                - 1 indicates sequence B is a random sequence.
+            kwargs (`dict[str, any]`, optional, defaults to *{}*):
+                Used to hide legacy arguments that have been deprecated.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, NezhaForPreTraining
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
+        >>> model = NezhaForPreTraining.from_pretrained("sijunhe/nezha-cn-base")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.prediction_logits
+        >>> seq_relationship_logits = outputs.seq_relationship_logits
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.nezha(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output, pooled_output = outputs[:2]
+        prediction_scores, seq_relationship_score = self.cls(sequence_output, pooled_output)
+
+        total_loss = None
+        if labels is not None and next_sentence_label is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+            next_sentence_loss = loss_fct(seq_relationship_score.view(-1, 2), next_sentence_label.view(-1))
+            total_loss = masked_lm_loss + next_sentence_loss
+
+        if not return_dict:
+            output = (prediction_scores, seq_relationship_score) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return NezhaForPreTrainingOutput(
+            loss=total_loss,
+            prediction_logits=prediction_scores,
+            seq_relationship_logits=seq_relationship_score,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings("""Nezha Model with a `language modeling` head on top.""", NEZHA_START_DOCSTRING)
+class NezhaForMaskedLM(NezhaPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `NezhaForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.nezha = NezhaModel(config, add_pooling_layer=False)
+        self.cls = NezhaOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.nezha(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, attention_mask=None, **model_kwargs):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+
+        #  add a dummy token
+        if self.config.pad_token_id is None:
+            raise ValueError("The PAD token should be defined for generation")
+
+        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+        dummy_token = torch.full(
+            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+        )
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+@add_start_docstrings(
+    """Nezha Model with a `next sentence prediction (classification)` head on top.""",
+    NEZHA_START_DOCSTRING,
+)
+class NezhaForNextSentencePrediction(NezhaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.nezha = NezhaModel(config)
+        self.cls = NezhaOnlyNSPHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], NextSentencePredictorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
+            (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, NezhaForNextSentencePrediction
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("sijunhe/nezha-cn-base")
+        >>> model = NezhaForNextSentencePrediction.from_pretrained("sijunhe/nezha-cn-base")
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+
+        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+        >>> logits = outputs.logits
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
+        ```
+        """
+
+        if "next_sentence_label" in kwargs:
+            warnings.warn(
+                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
+                " `labels` instead.",
+                FutureWarning,
+            )
+            labels = kwargs.pop("next_sentence_label")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.nezha(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        seq_relationship_scores = self.cls(pooled_output)
+
+        next_sentence_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+        if not return_dict:
+            output = (seq_relationship_scores,) + outputs[2:]
+            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
+
+        return NextSentencePredictorOutput(
+            loss=next_sentence_loss,
+            logits=seq_relationship_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Nezha Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    NEZHA_START_DOCSTRING,
+)
+class NezhaForSequenceClassification(NezhaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.nezha = NezhaModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.nezha(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Nezha Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    NEZHA_START_DOCSTRING,
+)
+class NezhaForMultipleChoice(NezhaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.nezha = NezhaModel(config)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], MultipleChoiceModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.nezha(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+        print(pooled_output.shape)
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        print(logits.shape)
+        print(num_choices)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Nezha Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    NEZHA_START_DOCSTRING,
+)
+class NezhaForTokenClassification(NezhaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.nezha = NezhaModel(config, add_pooling_layer=False)
+        classifier_dropout = (
+            config.classifier_dropout if config.classifier_dropout is not None else config.hidden_dropout_prob
+        )
+        self.dropout = nn.Dropout(classifier_dropout)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.nezha(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Nezha Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    NEZHA_START_DOCSTRING,
+)
+class NezhaForQuestionAnswering(NezhaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.nezha = NezhaModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(NEZHA_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        start_positions: Optional[torch.Tensor] = None,
+        end_positions: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.nezha(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "NezhaForNextSentencePrediction",
+    "NezhaForMaskedLM",
+    "NezhaForPreTraining",
+    "NezhaForMultipleChoice",
+    "NezhaForQuestionAnswering",
+    "NezhaForSequenceClassification",
+    "NezhaForTokenClassification",
+    "NezhaModel",
+    "NezhaPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b3964d194bed041987c6236b5c60bfcf3b7caf4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2023 EleutherAI and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_open_llama import *
+    from .modeling_open_llama import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bc9cc72a7661fe571cce6649b40f2307d9b3ce
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/configuration_open_llama.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Open-Llama model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class OpenLlamaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`OpenLlamaModel`]. It is used to instantiate an
+    Open-Llama model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the
+    [s-JoL/Open-Llama-V1](https://huggingface.co/s-JoL/Open-Llama-V1).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Open-Llama model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`OpenLlamaModel`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        tie_word_embeddings(`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+
+        Example:
+
+    ```python
+    >>> from transformers import OpenLlamaModel, OpenLlamaConfig
+
+    >>> # Initializing a Open-Llama open_llama-7b style configuration
+    >>> configuration = OpenLlamaConfig()
+
+    >>> # Initializing a model from the open_llama-7b style configuration
+    >>> model = OpenLlamaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "open-llama"
+
+    def __init__(
+        self,
+        vocab_size=100000,
+        hidden_size=4096,
+        intermediate_size=11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-6,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        use_memory_efficient_attention=True,
+        hidden_dropout_prob=0.1,
+        attention_dropout_prob=0.1,
+        use_stable_embedding=True,
+        shared_input_output_embedding=True,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.use_memory_efficient_attention = kwargs.pop(
+            "use_memorry_efficient_attention", use_memory_efficient_attention
+        )
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_dropout_prob = attention_dropout_prob
+        self.use_stable_embedding = use_stable_embedding
+        self.shared_input_output_embedding = shared_input_output_embedding
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self._rope_scaling_validation()
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                f"`rope_scaling` must be a dictionary with two fields, `type` and `factor`, got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None or not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+            raise ValueError(f"`rope_scaling`'s factor field must be a float > 1, got {rope_scaling_factor}")
+
+
+__all__ = ["OpenLlamaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py
new file mode 100644
index 0000000000000000000000000000000000000000..5e182e0f813f1c0c9ed138c39561cc23eff8cb26
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/open_llama/modeling_open_llama.py
@@ -0,0 +1,940 @@
+# coding=utf-8
+# Copyright 2023 EleutherAI and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Open-Llama model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_attn_mask_utils import _prepare_4d_causal_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_open_llama import OpenLlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+try:
+    from xformers import ops as xops
+except ImportError:
+    xops = None
+
+
+_CONFIG_FOR_DOC = "OpenLlamaConfig"
+
+
+class OpenLlamaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        OpenLlamaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class OpenLlamaRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+    cos_cached: torch.Tensor
+    sin_cached: torch.Tensor
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
+        super().__init__()
+
+        self.dim = dim
+        self.max_position_embeddings = max_position_embeddings
+        self.base = base
+        inv_freq = 1.0 / (
+            self.base
+            ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim)
+        )
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        # Build here to make `torch.jit.trace` work.
+        self._set_cos_sin_cache(
+            seq_len=max_position_embeddings, device=self.inv_freq.device, dtype=torch.get_default_dtype()
+        )
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+    def forward(self, x, seq_len=None):
+        # x: [bs, num_attention_heads, seq_len, head_size]
+        if seq_len > self.max_seq_len_cached:
+            self._set_cos_sin_cache(seq_len=seq_len, device=x.device, dtype=x.dtype)
+
+        return (
+            self.cos_cached[:seq_len].to(dtype=x.dtype),
+            self.sin_cached[:seq_len].to(dtype=x.dtype),
+        )
+
+
+class OpenLlamaLinearScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
+    """OpenLlamaRotaryEmbedding extended with linear scaling. Credits to the Reddit user /u/kaiokendev"""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+        t = t / self.scaling_factor
+
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+class OpenLlamaDynamicNTKScalingRotaryEmbedding(OpenLlamaRotaryEmbedding):
+    """OpenLlamaRotaryEmbedding extended with Dynamic NTK scaling. Credits to the Reddit users /u/bloc97 and /u/emozilla"""
+
+    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None, scaling_factor=1.0):
+        self.scaling_factor = scaling_factor
+        super().__init__(dim, max_position_embeddings, base, device)
+
+    def _set_cos_sin_cache(self, seq_len, device, dtype):
+        self.max_seq_len_cached = seq_len
+
+        if seq_len > self.max_position_embeddings:
+            base = self.base * (
+                (self.scaling_factor * seq_len / self.max_position_embeddings) - (self.scaling_factor - 1)
+            ) ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (
+                base
+                ** (torch.arange(0, self.dim, 2, dtype=torch.int64).to(device=device, dtype=torch.float) / self.dim)
+            )
+            self.register_buffer("inv_freq", inv_freq, persistent=False)
+
+        t = torch.arange(self.max_seq_len_cached, device=device, dtype=torch.int64).type_as(self.inv_freq)
+
+        freqs = torch.outer(t, self.inv_freq)
+        # Different from paper, but it uses a different permutation in order to obtain the same calculation
+        emb = torch.cat((freqs, freqs), dim=-1)
+        self.register_buffer("cos_cached", emb.cos().to(dtype), persistent=False)
+        self.register_buffer("sin_cached", emb.sin().to(dtype), persistent=False)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`):
+            The position indices of the tokens corresponding to the query and key tensors. For example, this can be
+            used to pass offsetted position ids when working with a KV-cache.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos[position_ids].unsqueeze(unsqueeze_dim)
+    sin = sin[position_ids].unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+class OpenLlamaMLP(nn.Module):
+    def __init__(
+        self,
+        hidden_size: int,
+        intermediate_size: int,
+        hidden_act: str,
+        dropout_prob: float,
+    ):
+        super().__init__()
+        self.gate_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)
+        self.up_proj = nn.Linear(hidden_size, intermediate_size, bias=False)
+        self.act_fn = ACT2FN[hidden_act]
+        self.dropout = nn.Dropout(dropout_prob)
+
+    def forward(self, x):
+        out = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return self.dropout(out)
+
+
+class OpenLlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: OpenLlamaConfig):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.hidden_size // self.num_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.dropout_prob = config.attention_dropout_prob
+        self.rope_theta = config.rope_theta
+
+        if (self.head_dim * self.num_heads) != self.hidden_size:
+            raise ValueError(
+                f"hidden_size must be divisible by num_heads (got `hidden_size`: {self.hidden_size}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+        self._init_rope()
+
+    def _init_rope(self):
+        if self.config.rope_scaling is None:
+            self.rotary_emb = OpenLlamaRotaryEmbedding(
+                self.head_dim,
+                max_position_embeddings=self.max_position_embeddings,
+                base=self.rope_theta,
+            )
+        else:
+            scaling_type = self.config.rope_scaling["type"]
+            scaling_factor = self.config.rope_scaling["factor"]
+            if scaling_type == "linear":
+                self.rotary_emb = OpenLlamaLinearScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            elif scaling_type == "dynamic":
+                self.rotary_emb = OpenLlamaDynamicNTKScalingRotaryEmbedding(
+                    self.head_dim,
+                    max_position_embeddings=self.max_position_embeddings,
+                    scaling_factor=scaling_factor,
+                    base=self.rope_theta,
+                )
+            else:
+                raise ValueError(f"Unknown RoPE scaling type {scaling_type}")
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+        use_cache: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        kv_seq_len = key_states.shape[-2]
+        if past_key_values is not None:
+            kv_seq_len += past_key_values[0].shape[-2]
+        cos, sin = self.rotary_emb(value_states, seq_len=kv_seq_len)
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin, position_ids)
+        # [bsz, nh, t, hd]
+
+        if past_key_values is not None:
+            # reuse k, v, self_attention
+            key_states = torch.cat([past_key_values[0], key_states], dim=2)
+            value_states = torch.cat([past_key_values[1], value_states], dim=2)
+
+        past_key_values = (key_states, value_states) if use_cache else None
+
+        if self.config.use_memory_efficient_attention and xops is not None and self.training:
+            attn_weights = None
+            query_states = query_states.transpose(1, 2)
+            key_states = key_states.transpose(1, 2)
+            value_states = value_states.transpose(1, 2)
+            attn_output = xops.memory_efficient_attention(
+                query_states, key_states, value_states, attn_bias=xops.LowerTriangularMask(), p=self.dropout_prob
+            )
+        else:
+            attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+            if attn_weights.size() != (bsz, self.num_heads, q_len, kv_seq_len):
+                raise ValueError(
+                    f"Attention weights should be of size {(bsz * self.num_heads, q_len, kv_seq_len)}, but is"
+                    f" {attn_weights.size()}"
+                )
+
+            if attention_mask is not None:
+                if attention_mask.size() != (bsz, 1, q_len, kv_seq_len):
+                    raise ValueError(
+                        f"Attention mask should be of size {(bsz, 1, q_len, kv_seq_len)}, but is {attention_mask.size()}"
+                    )
+                attn_weights = attn_weights + attention_mask
+                attn_weights = torch.max(
+                    attn_weights, torch.tensor(torch.finfo(attn_weights.dtype).min, device=attn_weights.device)
+                )
+
+            # upcast attention to fp32
+            attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+            attn_output = torch.matmul(attn_weights, value_states)
+
+            if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
+                raise ValueError(
+                    f"`attn_output` should be of size {(bsz, self.num_heads, q_len, self.head_dim)}, but is"
+                    f" {attn_output.size()}"
+                )
+
+            attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(bsz, q_len, self.hidden_size)
+
+        attn_output = self.o_proj(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights, past_key_values
+
+
+class OpenLlamaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: OpenLlamaConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = OpenLlamaAttention(config=config)
+        self.mlp = OpenLlamaMLP(
+            hidden_size=self.hidden_size,
+            intermediate_size=config.intermediate_size,
+            hidden_act=config.hidden_act,
+            dropout_prob=config.hidden_dropout_prob,
+        )
+        self.input_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            past_key_values (`Tuple(torch.FloatTensor)`, *optional*): cached past key and value projection states
+        """
+
+        residual = hidden_states
+
+        hidden_states = self.input_layernorm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+OPEN_LLAMA_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`OpenLlamaConfig`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
+    OPEN_LLAMA_START_DOCSTRING,
+)
+class OpenLlamaPreTrainedModel(PreTrainedModel):
+    config: OpenLlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["OpenLlamaDecoderLayer"]
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            if self.config.use_stable_embedding:
+                torch.nn.init.xavier_normal_(module.weight.data)
+            else:
+                module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+OPEN_LLAMA_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+            If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
+            and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+            information on the default strategy.
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.n_positions - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of shape
+            `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of shape
+            `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+            Contains pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
+            blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Open-Llama Model outputting raw hidden-states without any specific head on top.",
+    OPEN_LLAMA_START_DOCSTRING,
+)
+class OpenLlamaModel(OpenLlamaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`OpenLlamaDecoderLayer`]
+
+    Args:
+        config: OpenLlamaConfig
+    """
+
+    def __init__(self, config: OpenLlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        if config.use_stable_embedding:
+            self.embed_layer_norm = nn.LayerNorm(config.hidden_size)
+        else:
+            self.embed_layer_norm = None
+        self.layers = nn.ModuleList([OpenLlamaDecoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.norm = OpenLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            batch_size, seq_length = input_ids.shape
+        elif inputs_embeds is not None:
+            batch_size, seq_length, _ = inputs_embeds.shape
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        seq_length_with_past = seq_length
+        past_key_values_length = 0
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if past_key_values is not None:
+            past_key_values_length = past_key_values.get_seq_length()
+            seq_length_with_past = seq_length_with_past + past_key_values_length
+
+        if position_ids is None:
+            device = input_ids.device if input_ids is not None else inputs_embeds.device
+            position_ids = torch.arange(
+                past_key_values_length, seq_length + past_key_values_length, dtype=torch.long, device=device
+            )
+            position_ids = position_ids.unsqueeze(0)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+            if self.embed_layer_norm:
+                inputs_embeds = self.embed_layer_norm(inputs_embeds)
+        # embed positions
+        if self.config.use_memory_efficient_attention and self.training:
+            attention_mask = None
+        elif attention_mask is None:
+            attention_mask = torch.ones(
+                (batch_size, seq_length_with_past), dtype=torch.bool, device=inputs_embeds.device
+            )
+
+        input_shape = (batch_size, seq_length)
+        attention_mask = _prepare_4d_causal_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        next_decoder_cache = () if use_cache else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values[idx] if past_key_values is not None else None,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[2 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(v for v in [hidden_states, next_cache, all_hidden_states, all_self_attns] if v is not None)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+
+
+class OpenLlamaForCausalLM(OpenLlamaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = OpenLlamaModel(config)
+        if config.shared_input_output_embedding:
+            self.lm_head = None
+        else:
+            self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=CausalLMOutputWithPast, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, OpenLlamaForCausalLM
+
+        >>> model = OpenLlamaForCausalLM.from_pretrained("openlm-research/open_llama_7b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("openlm-research/open_llama_7b")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        if self.config.shared_input_output_embedding:
+            logits = torch.einsum(
+                "blh,vh->blv", hidden_states.to(self.model.embed_tokens.weight.device), self.model.embed_tokens.weight
+            )
+        else:
+            logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # move labels to correct device to enable model parallelism
+            labels = labels.to(logits.device)
+            # Shift so that tokens < n predict n
+            shift_logits = logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            shift_logits = shift_logits.view(-1, self.config.vocab_size)
+            shift_labels = shift_labels.view(-1)
+            # Enable model parallelism
+            shift_labels = shift_labels.to(shift_logits.device)
+            loss = loss_fct(shift_logits, shift_labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, inputs_embeds=None, **kwargs
+    ):
+        if past_key_values is not None:
+            past_length = past_key_values.get_seq_length()
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        position_ids = kwargs.get("position_ids")
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if past_key_values:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and past_key_values is None:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids}
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": kwargs.get("use_cache"),
+                "attention_mask": attention_mask,
+            }
+        )
+        return model_inputs
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
+@add_start_docstrings(
+    """
+    The LLaMa Model transformer with a sequence classification head on top (linear layer).
+
+    [`OpenLlamaForSequenceClassification`] uses the last token in order to do the classification, as other causal
+    models (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    OPEN_LLAMA_START_DOCSTRING,
+)
+class OpenLlamaForSequenceClassification(OpenLlamaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = OpenLlamaModel(config)
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(OPEN_LLAMA_INPUTS_DOCSTRING)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
+            else:
+                sequence_lengths = -1
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = ["OpenLlamaPreTrainedModel", "OpenLlamaModel", "OpenLlamaForCausalLM", "OpenLlamaForSequenceClassification"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..864b321bc2ee3a521e3d6da5403cab7363dea56b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_qdqbert import *
+    from .modeling_qdqbert import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..91ac82bc5a0292355ac659548c7c3d4336f2536c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/configuration_qdqbert.py
@@ -0,0 +1,123 @@
+# coding=utf-8
+# Copyright 2021 NVIDIA Corporation and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""QDQBERT model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class QDQBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`QDQBertModel`]. It is used to instantiate an
+    QDQBERT model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the BERT
+    [google-bert/bert-base-uncased](https://huggingface.co/google-bert/bert-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the QDQBERT model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`QDQBertModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`QDQBertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        is_decoder (`bool`, *optional*, defaults to `False`):
+            Whether the model is used as a decoder or not. If `False`, the model is used as an encoder.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+
+    Examples:
+
+    ```python
+    >>> from transformers import QDQBertModel, QDQBertConfig
+
+    >>> # Initializing a QDQBERT google-bert/bert-base-uncased style configuration
+    >>> configuration = QDQBertConfig()
+
+    >>> # Initializing a model from the google-bert/bert-base-uncased style configuration
+    >>> model = QDQBertModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "qdqbert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.type_vocab_size = type_vocab_size
+        self.layer_norm_eps = layer_norm_eps
+        self.use_cache = use_cache
+
+
+__all__ = ["QDQBertConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
new file mode 100644
index 0000000000000000000000000000000000000000..f92bc07a8bfb1c495773be09799a4f06744849f9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/qdqbert/modeling_qdqbert.py
@@ -0,0 +1,1736 @@
+# coding=utf-8
+# Copyright 2021 NVIDIA Corporation and The HuggingFace Team.
+# Copyright (c) 2018-2021, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch QDQBERT model."""
+
+import math
+import os
+import warnings
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    MaskedLMOutput,
+    MultipleChoiceModelOutput,
+    NextSentencePredictorOutput,
+    QuestionAnsweringModelOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    is_pytorch_quantization_available,
+    logging,
+    replace_return_docstrings,
+    requires_backends,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_qdqbert import QDQBertConfig
+
+
+logger = logging.get_logger(__name__)
+
+# soft dependency
+if is_pytorch_quantization_available():
+    try:
+        from pytorch_quantization import nn as quant_nn
+        from pytorch_quantization.nn.modules.tensor_quantizer import TensorQuantizer
+    except OSError:
+        logger.error(
+            "QDQBERT model are not usable since `pytorch_quantization` can't be loaded. Please try to reinstall it"
+            " following the instructions here:"
+            " https://github.com/NVIDIA/TensorRT/tree/master/tools/pytorch-quantization."
+        )
+
+_CHECKPOINT_FOR_DOC = "google-bert/bert-base-uncased"
+_CONFIG_FOR_DOC = "QDQBertConfig"
+
+
+def load_tf_weights_in_qdqbert(model, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                scope_names = re.split(r"_(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "squad":
+                pointer = getattr(pointer, "classifier")
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
+            array = np.transpose(array)
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+class QDQBertEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class QDQBertSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
+        self.key = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
+        self.value = quant_nn.QuantLinear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+        self.matmul_q_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+        self.matmul_k_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+        self.matmul_v_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+        self.matmul_a_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        output_attentions=False,
+    ):
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_values is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_values[0]
+            value_layer = past_key_values[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_values is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_values` is always `None`
+            past_key_values = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(
+            self.matmul_q_input_quantizer(query_layer), self.matmul_k_input_quantizer(key_layer.transpose(-1, -2))
+        )
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            seq_length = hidden_states.size()[1]
+            position_ids_l = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(seq_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in QDQBertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(
+            self.matmul_a_input_quantizer(attention_probs), self.matmul_v_input_quantizer(value_layer)
+        )
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_values,)
+        return outputs
+
+
+class QDQBertSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # Quantize Linear layer
+        self.dense = quant_nn.QuantLinear(config.hidden_size, config.hidden_size)
+
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # Quantize the inputs to the residual add
+        self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+        self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        # Quantize the inputs to the residual add
+        add_local = self.add_local_input_quantizer(hidden_states)
+        add_residual = self.add_residual_input_quantizer(input_tensor)
+        hidden_states = self.LayerNorm(add_local + add_residual)
+        return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertAttention with Bert -> QDQBert
+class QDQBertAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.self = QDQBertSelfAttention(config)
+        self.output = QDQBertSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        output_attentions=False,
+    ):
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_values,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class QDQBertIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # Quantize Linear layer
+        self.dense = quant_nn.QuantLinear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class QDQBertOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        # Quantize Linear layer
+        self.dense = quant_nn.QuantLinear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+        # Quantize the inputs to the residual add
+        self.add_local_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+        self.add_residual_input_quantizer = TensorQuantizer(quant_nn.QuantLinear.default_quant_desc_input)
+
+    def forward(self, hidden_states, input_tensor):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        # Quantize the inputs to the residual add
+        add_local = self.add_local_input_quantizer(hidden_states)
+        add_residual = self.add_residual_input_quantizer(input_tensor)
+        hidden_states = self.LayerNorm(add_local + add_residual)
+        return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertLayer with Bert -> QDQBert
+class QDQBertLayer(GradientCheckpointingLayer):
+    def __init__(self, config):
+        super().__init__()
+        self.seq_len_dim = 1
+        self.attention = QDQBertAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = QDQBertAttention(config)
+        self.intermediate = QDQBertIntermediate(config)
+        self.output = QDQBertOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        output_attentions=False,
+    ):
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_values=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple
+            cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = self.feed_forward_chunk(attention_output)
+        outputs = (layer_output,) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+# Based on transformers.models.bert.modeling_bert.BertEncoder with Bert -> QDQBert
+class QDQBertEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([QDQBertLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values[i] if past_key_values is not None else None,
+                output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class QDQBertPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class QDQBertPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertLMPredictionHead with Bert -> QDQBert
+class QDQBertLMPredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.transform = QDQBertPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+# Based on transformers.models.bert.modeling_bert.BertOnlyMLMHead with Bert -> QDQBert
+class QDQBertOnlyMLMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = QDQBertLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+class QDQBertOnlyNSPHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, pooled_output):
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return seq_relationship_score
+
+
+# Based on transformers.models.bert.modeling_bert.BertPreTrainingHeads with Bert -> QDQBert
+class QDQBertPreTrainingHeads(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = QDQBertLMPredictionHead(config)
+        self.seq_relationship = nn.Linear(config.hidden_size, 2)
+
+    def forward(self, sequence_output, pooled_output):
+        prediction_scores = self.predictions(sequence_output)
+        seq_relationship_score = self.seq_relationship(pooled_output)
+        return prediction_scores, seq_relationship_score
+
+
+# Based on transformers.models.bert.modeling_bert.BertPreTrainedModel with Bert -> QDQBert
+class QDQBertPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: QDQBertConfig
+    load_tf_weights = load_tf_weights_in_qdqbert
+    base_model_prefix = "bert"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+QDQBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`QDQBertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+QDQBERT_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare QDQBERT Model transformer outputting raw hidden-states without any specific head on top.",
+    QDQBERT_START_DOCSTRING,
+)
+class QDQBertModel(QDQBertPreTrainedModel):
+    """
+
+    The model can behave as an encoder (with only self-attention) as well as a decoder, in which case a layer of
+    cross-attention is added between the self-attention layers, following the architecture described in [Attention is
+    all you need](https://huggingface.co/papers/1706.03762) by Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit,
+    Llion Jones, Aidan N. Gomez, Lukasz Kaiser and Illia Polosukhin.
+
+    To behave as an decoder the model needs to be initialized with the `is_decoder` argument of the configuration set
+    to `True`. To be used in a Seq2Seq model, the model needs to initialized with both `is_decoder` argument and
+    `add_cross_attention` set to `True`; an `encoder_hidden_states` is then expected as an input to the forward pass.
+    """
+
+    def __init__(self, config, add_pooling_layer: bool = True):
+        requires_backends(self, "pytorch_quantization")
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = QDQBertEmbeddings(config)
+        self.encoder = QDQBertEncoder(config)
+
+        self.pooler = QDQBertPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPoolingAndCrossAttentions,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPoolingAndCrossAttentions]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            batch_size, seq_length = input_shape
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size, seq_length = input_shape
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    """QDQBERT Model with a `language modeling` head on top for CLM fine-tuning.""", QDQBERT_START_DOCSTRING
+)
+class QDQBertLMHeadModel(QDQBertPreTrainedModel):
+    _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if not config.is_decoder:
+            logger.warning("If you want to use `QDQBertLMHeadModel` as a standalone, add `is_decoder=True.`")
+
+        self.bert = QDQBertModel(config, add_pooling_layer=False)
+        self.cls = QDQBertOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[tuple[tuple[torch.LongTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, QDQBertLMHeadModel, QDQBertConfig
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-cased")
+        >>> config = QDQBertConfig.from_pretrained("google-bert/bert-base-cased")
+        >>> config.is_decoder = True
+        >>> model = QDQBertLMHeadModel.from_pretrained("google-bert/bert-base-cased", config=config)
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> prediction_logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        lm_loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            shifted_prediction_scores = prediction_scores[:, :-1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss_fct = CrossEntropyLoss()
+            lm_loss = loss_fct(shifted_prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((lm_loss,) + output) if lm_loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=lm_loss,
+            logits=prediction_scores,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        past_key_values=None,
+        attention_mask: Optional[torch.Tensor] = None,
+        **model_kwargs,
+    ):
+        input_shape = input_ids.shape
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        # cut decoder_input_ids if past_key_values is used
+        if past_key_values is not None:
+            past_length = past_key_values.get_seq_length()
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask, "past_key_values": past_key_values}
+
+    def _reorder_cache(self, past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
+@add_start_docstrings("""QDQBERT Model with a `language modeling` head on top.""", QDQBERT_START_DOCSTRING)
+class QDQBertForMaskedLM(QDQBertPreTrainedModel):
+    _tied_weights_keys = ["predictions.decoder.weight", "predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        if config.is_decoder:
+            logger.warning(
+                "If you want to use `QDQBertForMaskedLM` make sure `config.is_decoder=False` for "
+                "bi-directional self-attention."
+            )
+
+        self.bert = QDQBertModel(config, add_pooling_layer=False)
+        self.cls = QDQBertOnlyMLMHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MaskedLMOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        prediction_scores = self.cls(sequence_output)
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()  # -100 index = padding token
+            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (prediction_scores,) + outputs[2:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids: torch.LongTensor, attention_mask: Optional[torch.FloatTensor] = None, **model_kwargs
+    ):
+        input_shape = input_ids.shape
+        effective_batch_size = input_shape[0]
+
+        #  add a dummy token
+        if self.config.pad_token_id is None:
+            raise ValueError("The PAD token should be defined for generation")
+
+        attention_mask = torch.cat([attention_mask, attention_mask.new_zeros((attention_mask.shape[0], 1))], dim=-1)
+        dummy_token = torch.full(
+            (effective_batch_size, 1), self.config.pad_token_id, dtype=torch.long, device=input_ids.device
+        )
+        input_ids = torch.cat([input_ids, dummy_token], dim=1)
+
+        return {"input_ids": input_ids, "attention_mask": attention_mask}
+
+
+@add_start_docstrings(
+    """Bert Model with a `next sentence prediction (classification)` head on top.""",
+    QDQBERT_START_DOCSTRING,
+)
+class QDQBertForNextSentencePrediction(QDQBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = QDQBertModel(config)
+        self.cls = QDQBertOnlyNSPHead(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=NextSentencePredictorOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, NextSentencePredictorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the next sequence prediction (classification) loss. Input should be a sequence pair
+            (see `input_ids` docstring). Indices should be in `[0, 1]`:
+
+            - 0 indicates sequence B is a continuation of sequence A,
+            - 1 indicates sequence B is a random sequence.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, QDQBertForNextSentencePrediction
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+        >>> model = QDQBertForNextSentencePrediction.from_pretrained("google-bert/bert-base-uncased")
+
+        >>> prompt = "In Italy, pizza served in formal settings, such as at a restaurant, is presented unsliced."
+        >>> next_sentence = "The sky is blue due to the shorter wavelength of blue light."
+        >>> encoding = tokenizer(prompt, next_sentence, return_tensors="pt")
+
+        >>> outputs = model(**encoding, labels=torch.LongTensor([1]))
+        >>> logits = outputs.logits
+        >>> assert logits[0, 0] < logits[0, 1]  # next sentence was random
+        ```"""
+
+        if "next_sentence_label" in kwargs:
+            warnings.warn(
+                "The `next_sentence_label` argument is deprecated and will be removed in a future version, use"
+                " `labels` instead.",
+                FutureWarning,
+            )
+            labels = kwargs.pop("next_sentence_label")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        seq_relationship_scores = self.cls(pooled_output)
+
+        next_sentence_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            next_sentence_loss = loss_fct(seq_relationship_scores.view(-1, 2), labels.view(-1))
+
+        if not return_dict:
+            output = (seq_relationship_scores,) + outputs[2:]
+            return ((next_sentence_loss,) + output) if next_sentence_loss is not None else output
+
+        return NextSentencePredictorOutput(
+            loss=next_sentence_loss,
+            logits=seq_relationship_scores,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model transformer with a sequence classification/regression head on top (a linear layer on top of the pooled
+    output) e.g. for GLUE tasks.
+    """,
+    QDQBERT_START_DOCSTRING,
+)
+class QDQBertForSequenceClassification(QDQBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.config = config
+
+        self.bert = QDQBertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=SequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(logits, labels)
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    Bert Model with a multiple choice classification head on top (a linear layer on top of the pooled output and a
+    softmax) e.g. for RocStories/SWAG tasks.
+    """,
+    QDQBERT_START_DOCSTRING,
+)
+class QDQBertForMultipleChoice(QDQBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.bert = QDQBertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, 1)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=MultipleChoiceModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MultipleChoiceModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the multiple choice classification loss. Indices should be in `[0, ...,
+            num_choices-1]` where `num_choices` is the size of the second dimension of the input tensors. (See
+            `input_ids` above)
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        num_choices = input_ids.shape[1] if input_ids is not None else inputs_embeds.shape[1]
+
+        input_ids = input_ids.view(-1, input_ids.size(-1)) if input_ids is not None else None
+        attention_mask = attention_mask.view(-1, attention_mask.size(-1)) if attention_mask is not None else None
+        token_type_ids = token_type_ids.view(-1, token_type_ids.size(-1)) if token_type_ids is not None else None
+        position_ids = position_ids.view(-1, position_ids.size(-1)) if position_ids is not None else None
+        inputs_embeds = (
+            inputs_embeds.view(-1, inputs_embeds.size(-2), inputs_embeds.size(-1))
+            if inputs_embeds is not None
+            else None
+        )
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        pooled_output = self.dropout(pooled_output)
+        logits = self.classifier(pooled_output)
+        reshaped_logits = logits.view(-1, num_choices)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(reshaped_logits, labels)
+
+        if not return_dict:
+            output = (reshaped_logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return MultipleChoiceModelOutput(
+            loss=loss,
+            logits=reshaped_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    QDQBERT Model with a token classification head on top (a linear layer on top of the hidden-states output) e.g. for
+    Named-Entity-Recognition (NER) tasks.
+    """,
+    QDQBERT_START_DOCSTRING,
+)
+class QDQBertForTokenClassification(QDQBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = QDQBertModel(config, add_pooling_layer=False)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TokenClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the token classification loss. Indices should be in `[0, ..., config.num_labels - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        sequence_output = self.dropout(sequence_output)
+        logits = self.classifier(sequence_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    QDQBERT Model with a span classification head on top for extractive question-answering tasks like SQuAD (a linear
+    layers on top of the hidden-states output to compute `span start logits` and `span end logits`).
+    """,
+    QDQBERT_START_DOCSTRING,
+)
+class QDQBertForQuestionAnswering(QDQBertPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.bert = QDQBertModel(config, add_pooling_layer=False)
+        self.qa_outputs = nn.Linear(config.hidden_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(QDQBERT_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=QuestionAnsweringModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, QuestionAnsweringModelOutput]:
+        r"""
+        start_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.bert(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.qa_outputs(sequence_output)
+        start_logits, end_logits = logits.split(1, dim=-1)
+        start_logits = start_logits.squeeze(-1).contiguous()
+        end_logits = end_logits.squeeze(-1).contiguous()
+
+        total_loss = None
+        if start_positions is not None and end_positions is not None:
+            # If we are on multi-GPU, split add a dimension
+            if len(start_positions.size()) > 1:
+                start_positions = start_positions.squeeze(-1)
+            if len(end_positions.size()) > 1:
+                end_positions = end_positions.squeeze(-1)
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            ignored_index = start_logits.size(1)
+            start_positions = start_positions.clamp(0, ignored_index)
+            end_positions = end_positions.clamp(0, ignored_index)
+
+            loss_fct = CrossEntropyLoss(ignore_index=ignored_index)
+            start_loss = loss_fct(start_logits, start_positions)
+            end_loss = loss_fct(end_logits, end_positions)
+            total_loss = (start_loss + end_loss) / 2
+
+        if not return_dict:
+            output = (start_logits, end_logits) + outputs[2:]
+            return ((total_loss,) + output) if total_loss is not None else output
+
+        return QuestionAnsweringModelOutput(
+            loss=total_loss,
+            start_logits=start_logits,
+            end_logits=end_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "QDQBertForMaskedLM",
+    "QDQBertForMultipleChoice",
+    "QDQBertForNextSentencePrediction",
+    "QDQBertForQuestionAnswering",
+    "QDQBertForSequenceClassification",
+    "QDQBertForTokenClassification",
+    "QDQBertLayer",
+    "QDQBertLMHeadModel",
+    "QDQBertModel",
+    "QDQBertPreTrainedModel",
+    "load_tf_weights_in_qdqbert",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cdfdeb5d179c8e954c9c6822d3f154d632394964
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_realm import *
+    from .modeling_realm import *
+    from .retrieval_realm import *
+    from .tokenization_realm import *
+    from .tokenization_realm_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57d77c3606cc4e7925f9dc353570e509688d7e2d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/__pycache__/retrieval_realm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/configuration_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/configuration_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbf32378a604beca939ff6e0241f69f4c4382120
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/configuration_realm.py
@@ -0,0 +1,169 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""REALM model configuration."""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class RealmConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of
+
+    1. [`RealmEmbedder`]
+    2. [`RealmScorer`]
+    3. [`RealmKnowledgeAugEncoder`]
+    4. [`RealmRetriever`]
+    5. [`RealmReader`]
+    6. [`RealmForOpenQA`]
+
+    It is used to instantiate an REALM model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the REALM
+    [google/realm-cc-news-pretrained-embedder](https://huggingface.co/google/realm-cc-news-pretrained-embedder)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the REALM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`], [`RealmKnowledgeAugEncoder`], or
+            [`RealmReader`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        retriever_proj_size (`int`, *optional*, defaults to 128):
+            Dimension of the retriever(embedder) projection.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_candidates (`int`, *optional*, defaults to 8):
+            Number of candidates inputted to the RealmScorer or RealmKnowledgeAugEncoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_new"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`RealmEmbedder`], [`RealmScorer`],
+            [`RealmKnowledgeAugEncoder`], or [`RealmReader`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        span_hidden_size (`int`, *optional*, defaults to 256):
+            Dimension of the reader's spans.
+        max_span_width (`int`, *optional*, defaults to 10):
+            Max span width of the reader.
+        reader_layer_norm_eps (`float`, *optional*, defaults to 1e-3):
+            The epsilon used by the reader's layer normalization layers.
+        reader_beam_size (`int`, *optional*, defaults to 5):
+            Beam size of the reader.
+        reader_seq_len (`int`, *optional*, defaults to 288+32):
+            Maximum sequence length of the reader.
+        num_block_records (`int`, *optional*, defaults to 13353718):
+            Number of block records.
+        searcher_beam_size (`int`, *optional*, defaults to 5000):
+            Beam size of the searcher. Note that when eval mode is enabled, *searcher_beam_size* will be the same as
+            *reader_beam_size*.
+
+    Example:
+
+    ```python
+    >>> from transformers import RealmConfig, RealmEmbedder
+
+    >>> # Initializing a REALM realm-cc-news-pretrained-* style configuration
+    >>> configuration = RealmConfig()
+
+    >>> # Initializing a model (with random weights) from the google/realm-cc-news-pretrained-embedder style configuration
+    >>> model = RealmEmbedder(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "realm"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        retriever_proj_size=128,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        num_candidates=8,
+        intermediate_size=3072,
+        hidden_act="gelu_new",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        span_hidden_size=256,
+        max_span_width=10,
+        reader_layer_norm_eps=1e-3,
+        reader_beam_size=5,
+        reader_seq_len=320,  # 288 + 32
+        num_block_records=13353718,
+        searcher_beam_size=5000,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+        # Common config
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.retriever_proj_size = retriever_proj_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_candidates = num_candidates
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.type_vocab_size = type_vocab_size
+        self.layer_norm_eps = layer_norm_eps
+
+        # Reader config
+        self.span_hidden_size = span_hidden_size
+        self.max_span_width = max_span_width
+        self.reader_layer_norm_eps = reader_layer_norm_eps
+        self.reader_beam_size = reader_beam_size
+        self.reader_seq_len = reader_seq_len
+
+        # Retrieval config
+        self.num_block_records = num_block_records
+        self.searcher_beam_size = searcher_beam_size
+
+
+__all__ = ["RealmConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/modeling_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/modeling_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e2de5c9c1c4491390a0a31ce9c0613e9500096b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/modeling_realm.py
@@ -0,0 +1,1855 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch REALM model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    BaseModelOutputWithPoolingAndCrossAttentions,
+    MaskedLMOutput,
+    ModelOutput,
+)
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import add_start_docstrings, add_start_docstrings_to_model_forward, logging, replace_return_docstrings
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_realm import RealmConfig
+
+
+logger = logging.get_logger(__name__)
+_EMBEDDER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-embedder"
+_ENCODER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-encoder"
+_SCORER_CHECKPOINT_FOR_DOC = "google/realm-cc-news-pretrained-scorer"
+_CONFIG_FOR_DOC = "RealmConfig"
+
+
+def load_tf_weights_in_realm(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        if isinstance(model, RealmReader) and "reader" not in name:
+            logger.info(f"Skipping {name} as it is not {model.__class__.__name__}'s parameter")
+            continue
+
+        # For pretrained openqa reader
+        if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmForOpenQA):
+            name = name.replace("bert/", "reader/realm/")
+            name = name.replace("cls/", "reader/cls/")
+
+        # For pretrained encoder
+        if (name.startswith("bert") or name.startswith("cls")) and isinstance(model, RealmKnowledgeAugEncoder):
+            name = name.replace("bert/", "realm/")
+
+        # For finetuned reader
+        if name.startswith("reader"):
+            reader_prefix = "" if isinstance(model, RealmReader) else "reader/"
+            name = name.replace("reader/module/bert/", f"{reader_prefix}realm/")
+            name = name.replace("reader/module/cls/", f"{reader_prefix}cls/")
+            name = name.replace("reader/dense/", f"{reader_prefix}qa_outputs/dense_intermediate/")
+            name = name.replace("reader/dense_1/", f"{reader_prefix}qa_outputs/dense_output/")
+            name = name.replace("reader/layer_normalization", f"{reader_prefix}qa_outputs/layer_normalization")
+
+        # For embedder and scorer
+        if name.startswith("module/module/module/"):  # finetuned
+            embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/"
+            name = name.replace("module/module/module/module/bert/", f"{embedder_prefix}realm/")
+            name = name.replace("module/module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/")
+            name = name.replace("module/module/module/dense/", f"{embedder_prefix}cls/dense/")
+            name = name.replace("module/module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/")
+            name = name.replace("module/module/module/bert/", f"{embedder_prefix}realm/")
+            name = name.replace("module/module/module/cls/predictions/", f"{embedder_prefix}cls/predictions/")
+        elif name.startswith("module/module/"):  # pretrained
+            embedder_prefix = "" if isinstance(model, RealmEmbedder) else "embedder/"
+            name = name.replace("module/module/LayerNorm/", f"{embedder_prefix}cls/LayerNorm/")
+            name = name.replace("module/module/dense/", f"{embedder_prefix}cls/dense/")
+
+        name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                scope_names = re.split(r"_(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
+            array = np.transpose(array)
+        try:
+            assert pointer.shape == array.shape, (
+                f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+            )
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+class RealmEmbeddings(nn.Module):
+    """Construct the embeddings from word, position and token_type embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+        self.token_type_embeddings = nn.Embedding(config.type_vocab_size, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+        self.register_buffer(
+            "token_type_ids", torch.zeros(self.position_ids.size(), dtype=torch.long), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        # Setting the token_type_ids to the registered buffer in constructor where it is all zeros, which usually occurs
+        # when its auto-generated, registered buffer helps users when tracing the model without passing token_type_ids, solves
+        # issue #5664
+        if token_type_ids is None:
+            if hasattr(self, "token_type_ids"):
+                buffered_token_type_ids = self.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(input_shape[0], seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=self.position_ids.device)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+        token_type_embeddings = self.token_type_embeddings(token_type_ids)
+
+        embeddings = inputs_embeds + token_type_embeddings
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class RealmSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        mixed_query_layer = self.query(hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_values is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_values[0]
+            value_layer = past_key_values[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(encoder_hidden_states))
+            value_layer = self.transpose_for_scores(self.value(encoder_hidden_states))
+            attention_mask = encoder_attention_mask
+        elif past_key_values is not None:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+            key_layer = torch.cat([past_key_values[0], key_layer], dim=2)
+            value_layer = torch.cat([past_key_values[1], value_layer], dim=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(hidden_states))
+            value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        use_cache = past_key_values is not None
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_values` is always `None`
+            past_key_values = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if use_cache:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in RealmModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_values,)
+        return outputs
+
+
+class RealmSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+REALM_SELF_ATTENTION_CLASSES = {
+    "eager": RealmSelfAttention,
+}
+
+
+class RealmAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None):
+        super().__init__()
+        self.self = REALM_SELF_ATTENTION_CLASSES[config._attn_implementation](
+            config, position_embedding_type=position_embedding_type
+        )
+        self.output = RealmSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            past_key_values,
+            output_attentions,
+        )
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class RealmIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class RealmOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class RealmLayer(GradientCheckpointingLayer):
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = RealmAttention(config)
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = RealmAttention(config, position_embedding_type="absolute")
+        self.intermediate = RealmIntermediate(config)
+        self.output = RealmOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+        self_attention_outputs = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_values=self_attn_past_key_value,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers"
+                    " by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_values tuple
+            cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+            cross_attention_outputs = self.crossattention(
+                attention_output,
+                attention_mask,
+                head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                cross_attn_past_key_value,
+                output_attentions,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        outputs = (layer_output,) + outputs
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class RealmEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([RealmLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPastAndCrossAttentions]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                encoder_hidden_states,
+                encoder_attention_mask,
+                past_key_values[i] if past_key_values is not None else None,
+                output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    next_decoder_cache,
+                    all_hidden_states,
+                    all_self_attentions,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class RealmPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+@dataclass
+class RealmEmbedderOutput(ModelOutput):
+    """
+    Outputs of [`RealmEmbedder`] models.
+
+    Args:
+        projected_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`):
+
+            Projected score.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    projected_score: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class RealmScorerOutput(ModelOutput):
+    """
+    Outputs of [`RealmScorer`] models.
+
+    Args:
+        relevance_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates)`):
+            The relevance score of document candidates (before softmax).
+        query_score (`torch.FloatTensor` of shape `(batch_size, config.retriever_proj_size)`):
+            Query score derived from the query embedder.
+        candidate_score (`torch.FloatTensor` of shape `(batch_size, config.num_candidates, config.retriever_proj_size)`):
+            Candidate score derived from the embedder.
+    """
+
+    relevance_score: Optional[torch.FloatTensor] = None
+    query_score: Optional[torch.FloatTensor] = None
+    candidate_score: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+class RealmReaderOutput(ModelOutput):
+    """
+    Outputs of [`RealmReader`] models.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
+            Total loss.
+        retriever_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
+            Retriever loss.
+        reader_loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `start_positions`, `end_positions`, `has_answers` are provided):
+            Reader loss.
+        retriever_correct (`torch.BoolTensor` of shape `(config.searcher_beam_size,)`, *optional*):
+            Whether or not an evidence block contains answer.
+        reader_correct (`torch.BoolTensor` of shape `(config.reader_beam_size, num_candidates)`, *optional*):
+            Whether or not a span candidate contains answer.
+        block_idx (`torch.LongTensor` of shape `()`):
+            The index of the retrieved evidence block in which the predicted answer is most likely.
+        candidate (`torch.LongTensor` of shape `()`):
+            The index of the retrieved span candidates in which the predicted answer is most likely.
+        start_pos (`torch.IntTensor` of shape `()`):
+            Predicted answer starting position in *RealmReader*'s inputs.
+        end_pos (`torch.IntTensor` of shape `()`):
+            Predicted answer ending position in *RealmReader*'s inputs.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    retriever_loss: Optional[torch.FloatTensor] = None
+    reader_loss: Optional[torch.FloatTensor] = None
+    retriever_correct: Optional[torch.BoolTensor] = None
+    reader_correct: Optional[torch.BoolTensor] = None
+    block_idx: Optional[torch.LongTensor] = None
+    candidate: Optional[torch.LongTensor] = None
+    start_pos: Optional[torch.IntTensor] = None
+    end_pos: Optional[torch.IntTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class RealmForOpenQAOutput(ModelOutput):
+    """
+
+    Outputs of [`RealmForOpenQA`] models.
+
+    Args:
+        reader_output (`dict`):
+            Reader output.
+        predicted_answer_ids (`torch.LongTensor` of shape `(answer_sequence_length)`):
+            Predicted answer ids.
+    """
+
+    reader_output: Optional[dict] = None
+    predicted_answer_ids: Optional[torch.LongTensor] = None
+
+
+class RealmPredictionHeadTransform(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        if isinstance(config.hidden_act, str):
+            self.transform_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.transform_act_fn = config.hidden_act
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.transform_act_fn(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class RealmLMPredictionHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.transform = RealmPredictionHeadTransform(config)
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.decoder = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        self.bias = nn.Parameter(torch.zeros(config.vocab_size))
+
+        # Need a link between the two variables so that the bias is correctly resized with `resize_token_embeddings`
+        self.decoder.bias = self.bias
+
+    def _tie_weights(self):
+        self.decoder.bias = self.bias
+
+    def forward(self, hidden_states):
+        hidden_states = self.transform(hidden_states)
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+class RealmOnlyMLMHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = RealmLMPredictionHead(config)
+
+    def forward(self, sequence_output):
+        prediction_scores = self.predictions(sequence_output)
+        return prediction_scores
+
+
+class RealmScorerProjection(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.predictions = RealmLMPredictionHead(config)
+        self.dense = nn.Linear(config.hidden_size, config.retriever_proj_size)
+        self.LayerNorm = nn.LayerNorm(config.retriever_proj_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states)
+        return hidden_states
+
+
+class RealmReaderProjection(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.dense_intermediate = nn.Linear(config.hidden_size, config.span_hidden_size * 2)
+        self.dense_output = nn.Linear(config.span_hidden_size, 1)
+        self.layer_normalization = nn.LayerNorm(config.span_hidden_size, eps=config.reader_layer_norm_eps)
+        self.relu = nn.ReLU()
+
+    def forward(self, hidden_states, block_mask):
+        def span_candidates(masks):
+            """
+            Generate span candidates.
+
+            Args:
+                masks: <bool> [num_retrievals, max_sequence_len]
+
+            Returns:
+                starts: <int32> [num_spans] ends: <int32> [num_spans] span_masks: <int32> [num_retrievals, num_spans]
+                whether spans locate in evidence block.
+            """
+            _, max_sequence_len = masks.shape
+
+            def _spans_given_width(width):
+                current_starts = torch.arange(max_sequence_len - width + 1, device=masks.device)
+                current_ends = torch.arange(width - 1, max_sequence_len, device=masks.device)
+                return current_starts, current_ends
+
+            starts, ends = zip(*(_spans_given_width(w + 1) for w in range(self.config.max_span_width)))
+
+            # [num_spans]
+            starts = torch.cat(starts, 0)
+            ends = torch.cat(ends, 0)
+
+            # [num_retrievals, num_spans]
+            start_masks = torch.index_select(masks, dim=-1, index=starts)
+            end_masks = torch.index_select(masks, dim=-1, index=ends)
+            span_masks = start_masks * end_masks
+
+            return starts, ends, span_masks
+
+        def mask_to_score(mask, dtype=torch.float32):
+            return (1.0 - mask.type(dtype)) * torch.finfo(dtype).min
+
+        # [reader_beam_size, max_sequence_len, span_hidden_size * 2]
+        hidden_states = self.dense_intermediate(hidden_states)
+        # [reader_beam_size, max_sequence_len, span_hidden_size]
+        start_projection, end_projection = hidden_states.chunk(2, dim=-1)
+
+        candidate_starts, candidate_ends, candidate_mask = span_candidates(block_mask)
+
+        candidate_start_projections = torch.index_select(start_projection, dim=1, index=candidate_starts)
+        candidate_end_projections = torch.index_select(end_projection, dim=1, index=candidate_ends)
+        candidate_hidden = candidate_start_projections + candidate_end_projections
+
+        # [reader_beam_size, num_candidates, span_hidden_size]
+        candidate_hidden = self.relu(candidate_hidden)
+        # [reader_beam_size, num_candidates, span_hidden_size]
+        candidate_hidden = self.layer_normalization(candidate_hidden)
+        # [reader_beam_size, num_candidates]
+        reader_logits = self.dense_output(candidate_hidden).squeeze(-1)
+        # [reader_beam_size, num_candidates]
+        reader_logits += mask_to_score(candidate_mask, dtype=reader_logits.dtype)
+
+        return reader_logits, candidate_starts, candidate_ends
+
+
+REALM_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`RealmConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+REALM_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        position_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+
+            [What are position IDs?](../glossary#position-ids)
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `({0}, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert *input_ids* indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class RealmPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: RealmConfig
+    load_tf_weights = load_tf_weights_in_realm
+    base_model_prefix = "realm"
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+    def _flatten_inputs(self, *inputs):
+        """Flatten inputs' shape to (-1, input_shape[-1])"""
+        flattened_inputs = []
+        for tensor in inputs:
+            if tensor is None:
+                flattened_inputs.append(None)
+            else:
+                input_shape = tensor.shape
+                if len(input_shape) > 2:
+                    tensor = tensor.view((-1, input_shape[-1]))
+                flattened_inputs.append(tensor)
+        return flattened_inputs
+
+
+class RealmBertModel(RealmPreTrainedModel):
+    """
+    Same as the original BertModel but remove docstrings.
+    """
+
+    def __init__(self, config, add_pooling_layer=True):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = RealmEmbeddings(config)
+        self.encoder = RealmEncoder(config)
+
+        self.pooler = RealmPooler(config) if add_pooling_layer else None
+
+        # Weights initialization is mostly managed by other Realm models,
+        # but we also have them initialized here to keep a consistency.
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        token_type_ids=None,
+        position_ids=None,
+        head_mask=None,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        past_key_values=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if self.config.is_decoder:
+            use_cache = use_cache if use_cache is not None else self.config.use_cache
+        else:
+            use_cache = False
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        batch_size, seq_length = input_shape
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        # past_key_values_length
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        if attention_mask is None:
+            attention_mask = torch.ones(((batch_size, seq_length + past_key_values_length)), device=device)
+
+        if token_type_ids is None:
+            if hasattr(self.embeddings, "token_type_ids"):
+                buffered_token_type_ids = self.embeddings.token_type_ids[:, :seq_length]
+                buffered_token_type_ids_expanded = buffered_token_type_ids.expand(batch_size, seq_length)
+                token_type_ids = buffered_token_type_ids_expanded
+            else:
+                token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+
+        # We can provide a self-attention mask of dimensions [batch_size, from_seq_length, to_seq_length]
+        # ourselves in which case we just need to make it broadcastable to all heads.
+        extended_attention_mask: torch.Tensor = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.is_decoder and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_extended_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_extended_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            token_type_ids=token_type_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_extended_attention_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            return (sequence_output, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndCrossAttentions(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            cross_attentions=encoder_outputs.cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The embedder of REALM outputting projected score that will be used to calculate relevance score.",
+    REALM_START_DOCSTRING,
+)
+class RealmEmbedder(RealmPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder.bias"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.realm = RealmBertModel(self.config)
+        self.cls = RealmScorerProjection(self.config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.realm.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.realm.embeddings.word_embeddings = value
+
+    @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=RealmEmbedderOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, RealmEmbedderOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, RealmEmbedder
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-embedder")
+        >>> model = RealmEmbedder.from_pretrained("google/realm-cc-news-pretrained-embedder")
+
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> projected_score = outputs.projected_score
+        ```
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        realm_outputs = self.realm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # [batch_size, hidden_size]
+        pooler_output = realm_outputs[1]
+        # [batch_size, retriever_proj_size]
+        projected_score = self.cls(pooler_output)
+
+        if not return_dict:
+            return (projected_score,) + realm_outputs[2:4]
+        else:
+            return RealmEmbedderOutput(
+                projected_score=projected_score,
+                hidden_states=realm_outputs.hidden_states,
+                attentions=realm_outputs.attentions,
+            )
+
+
+@add_start_docstrings(
+    "The scorer of REALM outputting relevance scores representing the score of document candidates (before softmax).",
+    REALM_START_DOCSTRING,
+)
+class RealmScorer(RealmPreTrainedModel):
+    r"""
+    Args:
+        query_embedder ([`RealmEmbedder`]):
+            Embedder for input sequences. If not specified, it will use the same embedder as candidate sequences.
+    """
+
+    def __init__(self, config, query_embedder=None):
+        super().__init__(config)
+
+        self.embedder = RealmEmbedder(self.config)
+
+        self.query_embedder = query_embedder if query_embedder is not None else self.embedder
+
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=RealmScorerOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        candidate_input_ids: Optional[torch.LongTensor] = None,
+        candidate_attention_mask: Optional[torch.FloatTensor] = None,
+        candidate_token_type_ids: Optional[torch.LongTensor] = None,
+        candidate_inputs_embeds: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, RealmScorerOutput]:
+        r"""
+        candidate_input_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`):
+            Indices of candidate input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        candidate_attention_mask (`torch.FloatTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        candidate_token_type_ids (`torch.LongTensor` of shape `(batch_size, num_candidates, sequence_length)`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token.
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        candidate_inputs_embeds (`torch.FloatTensor` of shape `(batch_size * num_candidates, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `candidate_input_ids` you can choose to directly pass an embedded
+            representation. This is useful if you want more control over how to convert *candidate_input_ids* indices
+            into associated vectors than the model's internal embedding lookup matrix.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, RealmScorer
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-scorer")
+        >>> model = RealmScorer.from_pretrained("google/realm-cc-news-pretrained-scorer", num_candidates=2)
+
+        >>> # batch_size = 2, num_candidates = 2
+        >>> input_texts = ["How are you?", "What is the item in the picture?"]
+        >>> candidates_texts = [["Hello world!", "Nice to meet you!"], ["A cute cat.", "An adorable dog."]]
+
+        >>> inputs = tokenizer(input_texts, return_tensors="pt")
+        >>> candidates_inputs = tokenizer.batch_encode_candidates(candidates_texts, max_length=10, return_tensors="pt")
+
+        >>> outputs = model(
+        ...     **inputs,
+        ...     candidate_input_ids=candidates_inputs.input_ids,
+        ...     candidate_attention_mask=candidates_inputs.attention_mask,
+        ...     candidate_token_type_ids=candidates_inputs.token_type_ids,
+        ... )
+        >>> relevance_score = outputs.relevance_score
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("You have to specify either input_ids or input_embeds.")
+
+        if candidate_input_ids is None and candidate_inputs_embeds is None:
+            raise ValueError("You have to specify either candidate_input_ids or candidate_inputs_embeds.")
+
+        query_outputs = self.query_embedder(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # [batch_size * num_candidates, candidate_seq_len]
+        (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs(
+            candidate_input_ids, candidate_attention_mask, candidate_token_type_ids
+        )
+
+        candidate_outputs = self.embedder(
+            flattened_input_ids,
+            attention_mask=flattened_attention_mask,
+            token_type_ids=flattened_token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=candidate_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # [batch_size, retriever_proj_size]
+        query_score = query_outputs[0]
+        # [batch_size * num_candidates, retriever_proj_size]
+        candidate_score = candidate_outputs[0]
+        # [batch_size, num_candidates, retriever_proj_size]
+        candidate_score = candidate_score.view(-1, self.config.num_candidates, self.config.retriever_proj_size)
+        # [batch_size, num_candidates]
+        relevance_score = torch.einsum("bd,bnd->bn", query_score, candidate_score)
+
+        if not return_dict:
+            return relevance_score, query_score, candidate_score
+
+        return RealmScorerOutput(
+            relevance_score=relevance_score, query_score=query_score, candidate_score=candidate_score
+        )
+
+
+@add_start_docstrings(
+    "The knowledge-augmented encoder of REALM outputting masked language model logits and marginal log-likelihood"
+    " loss.",
+    REALM_START_DOCSTRING,
+)
+class RealmKnowledgeAugEncoder(RealmPreTrainedModel):
+    _tied_weights_keys = ["cls.predictions.decoder"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.realm = RealmBertModel(self.config)
+        self.cls = RealmOnlyMLMHead(self.config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.realm.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.realm.embeddings.word_embeddings = value
+
+    def get_output_embeddings(self):
+        return self.cls.predictions.decoder
+
+    def set_output_embeddings(self, new_embeddings):
+        self.cls.predictions.decoder = new_embeddings
+        self.cls.predictions.bias = new_embeddings.bias
+
+    @add_start_docstrings_to_model_forward(
+        REALM_INPUTS_DOCSTRING.format("batch_size, num_candidates, sequence_length")
+    )
+    @replace_return_docstrings(output_type=MaskedLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        relevance_score: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        mlm_mask: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, MaskedLMOutput]:
+        r"""
+        relevance_score (`torch.FloatTensor` of shape `(batch_size, num_candidates)`, *optional*):
+            Relevance score derived from RealmScorer, must be specified if you want to compute the masked language
+            modeling loss.
+
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+
+        mlm_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid calculating joint loss on certain positions. If not specified, the loss will not be masked.
+            Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoTokenizer, RealmKnowledgeAugEncoder
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-cc-news-pretrained-encoder")
+        >>> model = RealmKnowledgeAugEncoder.from_pretrained(
+        ...     "google/realm-cc-news-pretrained-encoder", num_candidates=2
+        ... )
+
+        >>> # batch_size = 2, num_candidates = 2
+        >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
+
+        >>> inputs = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and relevance_score is None:
+            raise ValueError(
+                "You have to specify `relevance_score` when `labels` is specified in order to compute loss."
+            )
+
+        (flattened_input_ids, flattened_attention_mask, flattened_token_type_ids) = self._flatten_inputs(
+            input_ids, attention_mask, token_type_ids
+        )
+
+        joint_outputs = self.realm(
+            flattened_input_ids,
+            attention_mask=flattened_attention_mask,
+            token_type_ids=flattened_token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # [batch_size * num_candidates, joint_seq_len, hidden_size]
+        joint_output = joint_outputs[0]
+        # [batch_size * num_candidates, joint_seq_len, vocab_size]
+        prediction_scores = self.cls(joint_output)
+        # [batch_size, num_candidates]
+        candidate_score = relevance_score
+
+        masked_lm_loss = None
+        if labels is not None:
+            batch_size, seq_length = labels.size()
+
+            if mlm_mask is None:
+                mlm_mask = torch.ones_like(labels, dtype=torch.float32)
+            else:
+                mlm_mask = mlm_mask.type(torch.float32)
+
+            # Compute marginal log-likelihood
+            loss_fct = CrossEntropyLoss(reduction="none")  # -100 index = padding token
+
+            # [batch_size * num_candidates * joint_seq_len, vocab_size]
+            mlm_logits = prediction_scores.view(-1, self.config.vocab_size)
+            # [batch_size * num_candidates * joint_seq_len]
+            mlm_targets = labels.tile(1, self.config.num_candidates).view(-1)
+            # [batch_size, num_candidates, joint_seq_len]
+            masked_lm_log_prob = -loss_fct(mlm_logits, mlm_targets).view(
+                batch_size, self.config.num_candidates, seq_length
+            )
+            # [batch_size, num_candidates, 1]
+            candidate_log_prob = candidate_score.log_softmax(-1).unsqueeze(-1)
+            # [batch_size, num_candidates, joint_seq_len]
+            joint_gold_log_prob = candidate_log_prob + masked_lm_log_prob
+            # [batch_size, joint_seq_len]
+            marginal_gold_log_probs = joint_gold_log_prob.logsumexp(1)
+            # []
+            masked_lm_loss = -torch.nansum(torch.sum(marginal_gold_log_probs * mlm_mask) / torch.sum(mlm_mask))
+
+        if not return_dict:
+            output = (prediction_scores,) + joint_outputs[2:4]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return MaskedLMOutput(
+            loss=masked_lm_loss,
+            logits=prediction_scores,
+            hidden_states=joint_outputs.hidden_states,
+            attentions=joint_outputs.attentions,
+        )
+
+
+@add_start_docstrings("The reader of REALM.", REALM_START_DOCSTRING)
+class RealmReader(RealmPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+
+        self.realm = RealmBertModel(config)
+        self.cls = RealmOnlyMLMHead(config)
+        self.qa_outputs = RealmReaderProjection(config)
+
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(REALM_INPUTS_DOCSTRING.format("reader_beam_size, sequence_length"))
+    @replace_return_docstrings(output_type=RealmReaderOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        relevance_score: Optional[torch.FloatTensor] = None,
+        block_mask: Optional[torch.BoolTensor] = None,
+        start_positions: Optional[torch.LongTensor] = None,
+        end_positions: Optional[torch.LongTensor] = None,
+        has_answers: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, RealmReaderOutput]:
+        r"""
+        relevance_score (`torch.FloatTensor` of shape `(searcher_beam_size,)`, *optional*):
+            Relevance score, which must be specified if you want to compute the logits and marginal log loss.
+        block_mask (`torch.BoolTensor` of shape `(searcher_beam_size, sequence_length)`, *optional*):
+            The mask of the evidence block, which must be specified if you want to compute the logits and marginal log
+            loss.
+        start_positions (`torch.LongTensor` of shape `(searcher_beam_size,)`, *optional*):
+            Labels for position (index) of the start of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        end_positions (`torch.LongTensor` of shape `(searcher_beam_size,)`, *optional*):
+            Labels for position (index) of the end of the labelled span for computing the token classification loss.
+            Positions are clamped to the length of the sequence (`sequence_length`). Position outside of the sequence
+            are not taken into account for computing the loss.
+        has_answers (`torch.BoolTensor` of shape `(searcher_beam_size,)`, *optional*):
+            Whether or not the evidence block has answer(s).
+
+        Returns:
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if relevance_score is None:
+            raise ValueError("You have to specify `relevance_score` to calculate logits and loss.")
+        if block_mask is None:
+            raise ValueError("You have to specify `block_mask` to separate question block and evidence block.")
+        if token_type_ids.size(1) < self.config.max_span_width:
+            raise ValueError("The input sequence length must be greater than or equal to config.max_span_width.")
+        outputs = self.realm(
+            input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        # [reader_beam_size, joint_seq_len, hidden_size]
+        sequence_output = outputs[0]
+
+        # [reader_beam_size, num_candidates], [num_candidates], [num_candidates]
+        reader_logits, candidate_starts, candidate_ends = self.qa_outputs(
+            sequence_output, block_mask[0 : self.config.reader_beam_size]
+        )
+        # [searcher_beam_size, 1]
+        retriever_logits = torch.unsqueeze(relevance_score[0 : self.config.reader_beam_size], -1)
+        # [reader_beam_size, num_candidates]
+        reader_logits += retriever_logits
+        # []
+        predicted_block_index = torch.argmax(torch.max(reader_logits, dim=1).values)
+        # []
+        predicted_candidate = torch.argmax(torch.max(reader_logits, dim=0).values)
+        # [1]
+        predicted_start = torch.index_select(candidate_starts, dim=0, index=predicted_candidate)
+        # [1]
+        predicted_end = torch.index_select(candidate_ends, dim=0, index=predicted_candidate)
+
+        total_loss = None
+        retriever_loss = None
+        reader_loss = None
+        retriever_correct = None
+        reader_correct = None
+        if start_positions is not None and end_positions is not None and has_answers is not None:
+
+            def compute_correct_candidates(candidate_starts, candidate_ends, gold_starts, gold_ends):
+                """Compute correct span."""
+                # [reader_beam_size, num_answers, num_candidates]
+                is_gold_start = torch.eq(
+                    torch.unsqueeze(torch.unsqueeze(candidate_starts, 0), 0), torch.unsqueeze(gold_starts, -1)
+                )
+                is_gold_end = torch.eq(
+                    torch.unsqueeze(torch.unsqueeze(candidate_ends, 0), 0), torch.unsqueeze(gold_ends, -1)
+                )
+
+                # [reader_beam_size, num_candidates]
+                return torch.any(torch.logical_and(is_gold_start, is_gold_end), 1)
+
+            def marginal_log_loss(logits, is_correct):
+                """Loss based on the negative marginal log-likelihood."""
+
+                def mask_to_score(mask, dtype=torch.float32):
+                    return (1.0 - mask.type(dtype)) * torch.finfo(dtype).min
+
+                # []
+                log_numerator = torch.logsumexp(logits + mask_to_score(is_correct, dtype=logits.dtype), dim=-1)
+                log_denominator = torch.logsumexp(logits, dim=-1)
+                return log_denominator - log_numerator
+
+            # sometimes the start/end positions are outside our model inputs, we ignore these terms
+            # `-1` is reserved for no answer.
+            ignored_index = sequence_output.size(1)
+            start_positions = start_positions.clamp(-1, ignored_index)
+            end_positions = end_positions.clamp(-1, ignored_index)
+
+            retriever_correct = has_answers
+            any_retriever_correct = torch.any(retriever_correct)
+
+            reader_correct = compute_correct_candidates(
+                candidate_starts=candidate_starts,
+                candidate_ends=candidate_ends,
+                gold_starts=start_positions[0 : self.config.reader_beam_size],
+                gold_ends=end_positions[0 : self.config.reader_beam_size],
+            )
+            any_reader_correct = torch.any(reader_correct)
+
+            retriever_loss = marginal_log_loss(relevance_score, retriever_correct)
+            reader_loss = marginal_log_loss(reader_logits.view(-1), reader_correct.view(-1))
+            retriever_loss *= any_retriever_correct.type(torch.float32)
+            reader_loss *= any_reader_correct.type(torch.float32)
+
+            total_loss = (retriever_loss + reader_loss).mean()
+
+        if not return_dict:
+            output = (predicted_block_index, predicted_candidate, predicted_start, predicted_end) + outputs[2:]
+            return (
+                ((total_loss, retriever_loss, reader_loss, retriever_correct, reader_correct) + output)
+                if total_loss is not None
+                else output
+            )
+
+        return RealmReaderOutput(
+            loss=total_loss,
+            retriever_loss=retriever_loss,
+            reader_loss=reader_loss,
+            retriever_correct=retriever_correct,
+            reader_correct=reader_correct,
+            block_idx=predicted_block_index,
+            candidate=predicted_candidate,
+            start_pos=predicted_start,
+            end_pos=predicted_end,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+REALM_FOR_OPEN_QA_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.FloatTensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        token_type_ids (`torch.LongTensor` of shape `({0})`, *optional*):
+            Segment token indices to indicate first and second portions of the inputs. Indices are selected in `[0,
+            1]`:
+
+            - 0 corresponds to a *sentence A* token,
+            - 1 corresponds to a *sentence B* token (should not be used in this model by design).
+
+            [What are token type IDs?](../glossary#token-type-ids)
+        answer_ids (`list` of shape `(num_answers, answer_length)`, *optional*):
+            Answer ids for computing the marginal log-likelihood loss. Indices should be in `[-1, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-1` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "`RealmForOpenQA` for end-to-end open domain question answering.",
+    REALM_START_DOCSTRING,
+)
+class RealmForOpenQA(RealmPreTrainedModel):
+    def __init__(self, config, retriever=None):
+        super().__init__(config)
+        self.embedder = RealmEmbedder(config)
+        self.reader = RealmReader(config)
+        self.register_buffer(
+            "block_emb",
+            torch.zeros(()).new_empty(
+                size=(config.num_block_records, config.retriever_proj_size),
+                dtype=torch.float32,
+                device=torch.device("cpu"),
+            ),
+        )
+        self.retriever = retriever
+
+        self.post_init()
+
+    @property
+    def searcher_beam_size(self):
+        if self.training:
+            return self.config.searcher_beam_size
+        return self.config.reader_beam_size
+
+    def block_embedding_to(self, device):
+        """Send `self.block_emb` to a specific device.
+
+        Args:
+            device (`str` or `torch.device`):
+                The device to which `self.block_emb` will be sent.
+        """
+
+        self.block_emb = self.block_emb.to(device)
+
+    @add_start_docstrings_to_model_forward(REALM_FOR_OPEN_QA_DOCSTRING.format("1, sequence_length"))
+    @replace_return_docstrings(output_type=RealmForOpenQAOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor],
+        attention_mask: Optional[torch.FloatTensor] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        answer_ids: Optional[torch.LongTensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, RealmForOpenQAOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import RealmForOpenQA, RealmRetriever, AutoTokenizer
+
+        >>> retriever = RealmRetriever.from_pretrained("google/realm-orqa-nq-openqa")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/realm-orqa-nq-openqa")
+        >>> model = RealmForOpenQA.from_pretrained("google/realm-orqa-nq-openqa", retriever=retriever)
+
+        >>> question = "Who is the pioneer in modern computer science?"
+        >>> question_ids = tokenizer([question], return_tensors="pt")
+        >>> answer_ids = tokenizer(
+        ...     ["alan mathison turing"],
+        ...     add_special_tokens=False,
+        ...     return_token_type_ids=False,
+        ...     return_attention_mask=False,
+        ... ).input_ids
+
+        >>> reader_output, predicted_answer_ids = model(**question_ids, answer_ids=answer_ids, return_dict=False)
+        >>> predicted_answer = tokenizer.decode(predicted_answer_ids)
+        >>> loss = reader_output.loss
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and input_ids.shape[0] != 1:
+            raise ValueError("The batch_size of the inputs must be 1.")
+
+        question_outputs = self.embedder(
+            input_ids=input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask, return_dict=True
+        )
+        # [1, projection_size]
+        question_projection = question_outputs[0]
+
+        # CPU computation starts.
+        # [1, block_emb_size]
+        batch_scores = torch.einsum("BD,QD->QB", self.block_emb, question_projection.to(self.block_emb.device))
+        # [1, searcher_beam_size]
+        _, retrieved_block_ids = torch.topk(batch_scores, k=self.searcher_beam_size, dim=-1)
+        # [searcher_beam_size]
+        retrieved_block_ids = retrieved_block_ids.squeeze()
+        # [searcher_beam_size, projection_size]
+        retrieved_block_emb = torch.index_select(self.block_emb, dim=0, index=retrieved_block_ids)
+        # CPU computation ends.
+
+        # Retrieve possible answers
+        has_answers, start_pos, end_pos, concat_inputs = self.retriever(
+            retrieved_block_ids.cpu(), input_ids, answer_ids, max_length=self.config.reader_seq_len
+        )
+
+        concat_inputs = concat_inputs.to(self.reader.device)
+        block_mask = concat_inputs.special_tokens_mask.type(torch.bool).to(device=self.reader.device)
+        block_mask.logical_not_().logical_and_(concat_inputs.token_type_ids.type(torch.bool))
+
+        if has_answers is not None:
+            has_answers = torch.tensor(has_answers, dtype=torch.bool, device=self.reader.device)
+            start_pos = torch.tensor(start_pos, dtype=torch.long, device=self.reader.device)
+            end_pos = torch.tensor(end_pos, dtype=torch.long, device=self.reader.device)
+
+        # [searcher_beam_size]
+        retrieved_logits = torch.einsum(
+            "D,BD->B", question_projection.squeeze(), retrieved_block_emb.to(self.reader.device)
+        )
+
+        reader_output = self.reader(
+            input_ids=concat_inputs.input_ids[0 : self.config.reader_beam_size],
+            attention_mask=concat_inputs.attention_mask[0 : self.config.reader_beam_size],
+            token_type_ids=concat_inputs.token_type_ids[0 : self.config.reader_beam_size],
+            relevance_score=retrieved_logits,
+            block_mask=block_mask,
+            has_answers=has_answers,
+            start_positions=start_pos,
+            end_positions=end_pos,
+            return_dict=True,
+        )
+
+        predicted_block = concat_inputs.input_ids[reader_output.block_idx]
+        predicted_answer_ids = predicted_block[reader_output.start_pos : reader_output.end_pos + 1]
+
+        if not return_dict:
+            return reader_output, predicted_answer_ids
+
+        return RealmForOpenQAOutput(
+            reader_output=reader_output,
+            predicted_answer_ids=predicted_answer_ids,
+        )
+
+
+__all__ = [
+    "RealmEmbedder",
+    "RealmForOpenQA",
+    "RealmKnowledgeAugEncoder",
+    "RealmPreTrainedModel",
+    "RealmReader",
+    "RealmScorer",
+    "load_tf_weights_in_realm",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/retrieval_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/retrieval_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e47abb11791aedccd5621677713136a037be8c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/retrieval_realm.py
@@ -0,0 +1,176 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""REALM Retriever model implementation."""
+
+import os
+from typing import Optional, Union
+
+import numpy as np
+from huggingface_hub import hf_hub_download
+
+from transformers import AutoTokenizer
+
+from ....utils import logging, strtobool
+
+
+_REALM_BLOCK_RECORDS_FILENAME = "block_records.npy"
+
+
+logger = logging.get_logger(__name__)
+
+
+def convert_tfrecord_to_np(block_records_path: str, num_block_records: int) -> np.ndarray:
+    import tensorflow.compat.v1 as tf
+
+    blocks_dataset = tf.data.TFRecordDataset(block_records_path, buffer_size=512 * 1024 * 1024)
+    blocks_dataset = blocks_dataset.batch(num_block_records, drop_remainder=True)
+    np_record = next(blocks_dataset.take(1).as_numpy_iterator())
+
+    return np_record
+
+
+class ScaNNSearcher:
+    """Note that ScaNNSearcher cannot currently be used within the model. In future versions, it might however be included."""
+
+    def __init__(
+        self,
+        db,
+        num_neighbors,
+        dimensions_per_block=2,
+        num_leaves=1000,
+        num_leaves_to_search=100,
+        training_sample_size=100000,
+    ):
+        """Build scann searcher."""
+
+        from scann.scann_ops.py.scann_ops_pybind import builder as Builder
+
+        builder = Builder(db=db, num_neighbors=num_neighbors, distance_measure="dot_product")
+        builder = builder.tree(
+            num_leaves=num_leaves, num_leaves_to_search=num_leaves_to_search, training_sample_size=training_sample_size
+        )
+        builder = builder.score_ah(dimensions_per_block=dimensions_per_block)
+
+        self.searcher = builder.build()
+
+    def search_batched(self, question_projection):
+        retrieved_block_ids, _ = self.searcher.search_batched(question_projection.detach().cpu())
+        return retrieved_block_ids.astype("int64")
+
+
+class RealmRetriever:
+    """The retriever of REALM outputting the retrieved evidence block and whether the block has answers as well as answer
+    positions."
+
+        Parameters:
+            block_records (`np.ndarray`):
+                A numpy array which contains evidence texts.
+            tokenizer ([`RealmTokenizer`]):
+                The tokenizer to encode retrieved texts.
+    """
+
+    def __init__(self, block_records, tokenizer):
+        super().__init__()
+        self.block_records = block_records
+        self.tokenizer = tokenizer
+
+    def __call__(self, retrieved_block_ids, question_input_ids, answer_ids, max_length=None, return_tensors="pt"):
+        retrieved_blocks = np.take(self.block_records, indices=retrieved_block_ids, axis=0)
+
+        question = self.tokenizer.decode(question_input_ids[0], skip_special_tokens=True)
+
+        text = []
+        text_pair = []
+        for retrieved_block in retrieved_blocks:
+            text.append(question)
+            text_pair.append(retrieved_block.decode())
+
+        concat_inputs = self.tokenizer(
+            text, text_pair, padding=True, truncation=True, return_special_tokens_mask=True, max_length=max_length
+        )
+        concat_inputs_tensors = concat_inputs.convert_to_tensors(return_tensors)
+
+        if answer_ids is not None:
+            return self.block_has_answer(concat_inputs, answer_ids) + (concat_inputs_tensors,)
+        else:
+            return (None, None, None, concat_inputs_tensors)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path: Optional[Union[str, os.PathLike]], *init_inputs, **kwargs):
+        if os.path.isdir(pretrained_model_name_or_path):
+            block_records_path = os.path.join(pretrained_model_name_or_path, _REALM_BLOCK_RECORDS_FILENAME)
+        else:
+            block_records_path = hf_hub_download(
+                repo_id=pretrained_model_name_or_path, filename=_REALM_BLOCK_RECORDS_FILENAME, **kwargs
+            )
+        if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+            raise ValueError(
+                "This part uses `pickle.load` which is insecure and will execute arbitrary code that is "
+                "potentially malicious. It's recommended to never unpickle data that could have come from an "
+                "untrusted source, or that could have been tampered with. If you already verified the pickle "
+                "data and decided to use it, you can set the environment variable "
+                "`TRUST_REMOTE_CODE` to `True` to allow it."
+            )
+        block_records = np.load(block_records_path, allow_pickle=True)
+
+        tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path, *init_inputs, **kwargs)
+
+        return cls(block_records, tokenizer)
+
+    def save_pretrained(self, save_directory):
+        # save block records
+        np.save(os.path.join(save_directory, _REALM_BLOCK_RECORDS_FILENAME), self.block_records)
+        # save tokenizer
+        self.tokenizer.save_pretrained(save_directory)
+
+    def block_has_answer(self, concat_inputs, answer_ids):
+        """check if retrieved_blocks has answers."""
+        has_answers = []
+        start_pos = []
+        end_pos = []
+        max_answers = 0
+
+        for input_id in concat_inputs.input_ids:
+            input_id_list = input_id.tolist()
+            # Check answers between two [SEP] tokens
+            first_sep_idx = input_id_list.index(self.tokenizer.sep_token_id)
+            second_sep_idx = first_sep_idx + 1 + input_id_list[first_sep_idx + 1 :].index(self.tokenizer.sep_token_id)
+
+            start_pos.append([])
+            end_pos.append([])
+            for answer in answer_ids:
+                for idx in range(first_sep_idx + 1, second_sep_idx):
+                    if answer[0] == input_id_list[idx]:
+                        if input_id_list[idx : idx + len(answer)] == answer:
+                            start_pos[-1].append(idx)
+                            end_pos[-1].append(idx + len(answer) - 1)
+
+            if len(start_pos[-1]) == 0:
+                has_answers.append(False)
+            else:
+                has_answers.append(True)
+                if len(start_pos[-1]) > max_answers:
+                    max_answers = len(start_pos[-1])
+
+        # Pad -1 to max_answers
+        for start_pos_, end_pos_ in zip(start_pos, end_pos):
+            if len(start_pos_) < max_answers:
+                padded = [-1] * (max_answers - len(start_pos_))
+                start_pos_ += padded
+                end_pos_ += padded
+        return has_answers, start_pos, end_pos
+
+
+__all__ = ["RealmRetriever"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm.py
new file mode 100644
index 0000000000000000000000000000000000000000..af7cdc1a5bac3afea8ed0651ae2892adcc03cbc3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm.py
@@ -0,0 +1,534 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for REALM."""
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ....tokenization_utils_base import BatchEncoding
+from ....utils import PaddingStrategy, logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class RealmTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a REALM tokenizer.
+
+    [`RealmTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting and
+    wordpiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether or not to do basic tokenization before WordPiece.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = RealmTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+        super().__init__(
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+    @property
+    def do_lower_case(self):
+        return self.basic_tokenizer.do_lower_case
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(text, never_split=self.all_special_tokens):
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+
+    def batch_encode_candidates(self, text, **kwargs):
+        r"""
+        Encode a batch of text or text pair. This method is similar to regular __call__ method but has the following
+        differences:
+
+            1. Handle additional num_candidate axis. (batch_size, num_candidates, text)
+            2. Always pad the sequences to *max_length*.
+            3. Must specify *max_length* in order to stack packs of candidates into a batch.
+
+            - single sequence: `[CLS] X [SEP]`
+            - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            text (`List[List[str]]`):
+                The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+                num_candidates, text).
+            text_pair (`List[List[str]]`, *optional*):
+                The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+                num_candidates, text).
+            **kwargs:
+                Keyword arguments of the __call__ method.
+
+        Returns:
+            [`BatchEncoding`]: Encoded text or text pair.
+
+        Example:
+
+        ```python
+        >>> from transformers import RealmTokenizer
+
+        >>> # batch_size = 2, num_candidates = 2
+        >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
+
+        >>> tokenizer = RealmTokenizer.from_pretrained("google/realm-cc-news-pretrained-encoder")
+        >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+        ```"""
+
+        # Always using a fixed sequence length to encode in order to stack candidates into a batch.
+        kwargs["padding"] = PaddingStrategy.MAX_LENGTH
+
+        batch_text = text
+        batch_text_pair = kwargs.pop("text_pair", None)
+        return_tensors = kwargs.pop("return_tensors", None)
+
+        output_data = {
+            "input_ids": [],
+            "attention_mask": [],
+            "token_type_ids": [],
+        }
+
+        for idx, candidate_text in enumerate(batch_text):
+            if batch_text_pair is not None:
+                candidate_text_pair = batch_text_pair[idx]
+            else:
+                candidate_text_pair = None
+
+            encoded_candidates = super().__call__(candidate_text, candidate_text_pair, return_tensors=None, **kwargs)
+
+            encoded_input_ids = encoded_candidates.get("input_ids")
+            encoded_attention_mask = encoded_candidates.get("attention_mask")
+            encoded_token_type_ids = encoded_candidates.get("token_type_ids")
+
+            if encoded_input_ids is not None:
+                output_data["input_ids"].append(encoded_input_ids)
+            if encoded_attention_mask is not None:
+                output_data["attention_mask"].append(encoded_attention_mask)
+            if encoded_token_type_ids is not None:
+                output_data["token_type_ids"].append(encoded_token_type_ids)
+
+        output_data = {key: item for key, item in output_data.items() if len(item) != 0}
+
+        return BatchEncoding(output_data, tensor_type=return_tensors)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A REALM sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
+
+
+class BasicTokenizer:
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+    """
+
+    def __init__(self, do_lower_case=True, never_split=None, tokenize_chinese_chars=True, strip_accents=None):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. Split on "white spaces" only, for sub-word tokenization, see
+        WordPieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+        """
+        # union() returns a new set by concatenating the two sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        orig_tokens = whitespace_tokenize(text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if never_split is not None and text in never_split:
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer:
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+__all__ = ["RealmTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..53a1d99e0ac804ae196e9c65f0e2979dcb2401c8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/realm/tokenization_realm_fast.py
@@ -0,0 +1,223 @@
+# coding=utf-8
+# Copyright 2022 The REALM authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization classes for REALM."""
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ....tokenization_utils_base import BatchEncoding
+from ....tokenization_utils_fast import PreTrainedTokenizerFast
+from ....utils import PaddingStrategy, logging
+from .tokenization_realm import RealmTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class RealmTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" REALM tokenizer (backed by HuggingFace's *tokenizers* library). Based on WordPiece.
+
+    [`RealmTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation
+    splitting and wordpiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = RealmTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+        self.do_lower_case = do_lower_case
+
+    def batch_encode_candidates(self, text, **kwargs):
+        r"""
+        Encode a batch of text or text pair. This method is similar to regular __call__ method but has the following
+        differences:
+
+            1. Handle additional num_candidate axis. (batch_size, num_candidates, text)
+            2. Always pad the sequences to *max_length*.
+            3. Must specify *max_length* in order to stack packs of candidates into a batch.
+
+            - single sequence: `[CLS] X [SEP]`
+            - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            text (`List[List[str]]`):
+                The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+                num_candidates, text).
+            text_pair (`List[List[str]]`, *optional*):
+                The batch of sequences to be encoded. Each sequence must be in this format: (batch_size,
+                num_candidates, text).
+            **kwargs:
+                Keyword arguments of the __call__ method.
+
+        Returns:
+            [`BatchEncoding`]: Encoded text or text pair.
+
+        Example:
+
+        ```python
+        >>> from transformers import RealmTokenizerFast
+
+        >>> # batch_size = 2, num_candidates = 2
+        >>> text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]
+
+        >>> tokenizer = RealmTokenizerFast.from_pretrained("google/realm-cc-news-pretrained-encoder")
+        >>> tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")
+        ```"""
+
+        # Always using a fixed sequence length to encode in order to stack candidates into a batch.
+        kwargs["padding"] = PaddingStrategy.MAX_LENGTH
+
+        batch_text = text
+        batch_text_pair = kwargs.pop("text_pair", None)
+        return_tensors = kwargs.pop("return_tensors", None)
+
+        output_data = {
+            "input_ids": [],
+            "attention_mask": [],
+            "token_type_ids": [],
+        }
+
+        for idx, candidate_text in enumerate(batch_text):
+            if batch_text_pair is not None:
+                candidate_text_pair = batch_text_pair[idx]
+            else:
+                candidate_text_pair = None
+
+            encoded_candidates = super().__call__(candidate_text, candidate_text_pair, return_tensors=None, **kwargs)
+
+            encoded_input_ids = encoded_candidates.get("input_ids")
+            encoded_attention_mask = encoded_candidates.get("attention_mask")
+            encoded_token_type_ids = encoded_candidates.get("token_type_ids")
+
+            if encoded_input_ids is not None:
+                output_data["input_ids"].append(encoded_input_ids)
+            if encoded_attention_mask is not None:
+                output_data["attention_mask"].append(encoded_attention_mask)
+            if encoded_token_type_ids is not None:
+                output_data["token_type_ids"].append(encoded_token_type_ids)
+
+        output_data = {key: item for key, item in output_data.items() if len(item) != 0}
+
+        return BatchEncoding(output_data, tensor_type=return_tensors)
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A REALM sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1 is not None:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+
+__all__ = ["RealmTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a875576607430c041abab01eccaf468a6cc9272e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_retribert import *
+    from .modeling_retribert import *
+    from .tokenization_retribert import *
+    from .tokenization_retribert_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py
new file mode 100644
index 0000000000000000000000000000000000000000..80d755a16961450cae783d834385e5e6873dc24e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/configuration_retribert.py
@@ -0,0 +1,108 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RetriBERT model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class RetriBertConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RetriBertModel`]. It is used to instantiate a
+    RetriBertModel model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the RetriBERT
+    [yjernite/retribert-base-uncased](https://huggingface.co/yjernite/retribert-base-uncased) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the RetriBERT model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`RetriBertModel`]
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the *token_type_ids* passed into [`BertModel`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        share_encoders (`bool`, *optional*, defaults to `True`):
+            Whether or not to use the same Bert-type encoder for the queries and document
+        projection_dim (`int`, *optional*, defaults to 128):
+            Final dimension of the query and document representation after projection
+    """
+
+    model_type = "retribert"
+
+    def __init__(
+        self,
+        vocab_size=30522,
+        hidden_size=768,
+        num_hidden_layers=8,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        share_encoders=True,
+        projection_dim=128,
+        pad_token_id=0,
+        **kwargs,
+    ):
+        super().__init__(pad_token_id=pad_token_id, **kwargs)
+
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.hidden_act = hidden_act
+        self.intermediate_size = intermediate_size
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.max_position_embeddings = max_position_embeddings
+        self.type_vocab_size = type_vocab_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.share_encoders = share_encoders
+        self.projection_dim = projection_dim
+
+
+__all__ = ["RetriBertConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py
new file mode 100644
index 0000000000000000000000000000000000000000..06806e8e6d0bb01fa31c6849ccb23f3e29bf74aa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/modeling_retribert.py
@@ -0,0 +1,217 @@
+# coding=utf-8
+# Copyright 2019-present, the HuggingFace Inc. team, The Google AI Language Team and Facebook, Inc.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+RetriBERT model
+"""
+
+import math
+from typing import Optional
+
+import torch
+import torch.utils.checkpoint as checkpoint
+from torch import nn
+
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging
+from ...bert.modeling_bert import BertModel
+from .configuration_retribert import RetriBertConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# INTERFACE FOR ENCODER AND TASK SPECIFIC MODEL #
+class RetriBertPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: RetriBertConfig
+    load_tf_weights = None
+    base_model_prefix = "retribert"
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+RETRIBERT_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`RetriBertConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+@add_start_docstrings(
+    """Bert Based model to embed queries or document for document retrieval.""",
+    RETRIBERT_START_DOCSTRING,
+)
+class RetriBertModel(RetriBertPreTrainedModel):
+    def __init__(self, config: RetriBertConfig) -> None:
+        super().__init__(config)
+        self.projection_dim = config.projection_dim
+
+        self.bert_query = BertModel(config)
+        self.bert_doc = None if config.share_encoders else BertModel(config)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.project_query = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
+        self.project_doc = nn.Linear(config.hidden_size, config.projection_dim, bias=False)
+
+        self.ce_loss = nn.CrossEntropyLoss(reduction="mean")
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def embed_sentences_checkpointed(
+        self,
+        input_ids,
+        attention_mask,
+        sent_encoder,
+        checkpoint_batch_size=-1,
+    ):
+        # reproduces BERT forward pass with checkpointing
+        if checkpoint_batch_size < 0 or input_ids.shape[0] < checkpoint_batch_size:
+            return sent_encoder(input_ids, attention_mask=attention_mask)[1]
+        else:
+            # prepare implicit variables
+            device = input_ids.device
+            input_shape = input_ids.size()
+            token_type_ids = torch.zeros(input_shape, dtype=torch.long, device=device)
+            head_mask = [None] * sent_encoder.config.num_hidden_layers
+            extended_attention_mask: torch.Tensor = sent_encoder.get_extended_attention_mask(
+                attention_mask, input_shape
+            )
+
+            # define function for checkpointing
+            def partial_encode(*inputs):
+                encoder_outputs = sent_encoder.encoder(
+                    inputs[0],
+                    attention_mask=inputs[1],
+                    head_mask=head_mask,
+                )
+                sequence_output = encoder_outputs[0]
+                pooled_output = sent_encoder.pooler(sequence_output)
+                return pooled_output
+
+            # run embedding layer on everything at once
+            embedding_output = sent_encoder.embeddings(
+                input_ids=input_ids, position_ids=None, token_type_ids=token_type_ids, inputs_embeds=None
+            )
+            # run encoding and pooling on one mini-batch at a time
+            pooled_output_list = []
+            for b in range(math.ceil(input_ids.shape[0] / checkpoint_batch_size)):
+                b_embedding_output = embedding_output[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
+                b_attention_mask = extended_attention_mask[b * checkpoint_batch_size : (b + 1) * checkpoint_batch_size]
+                pooled_output = checkpoint.checkpoint(partial_encode, b_embedding_output, b_attention_mask)
+                pooled_output_list.append(pooled_output)
+            return torch.cat(pooled_output_list, dim=0)
+
+    def embed_questions(
+        self,
+        input_ids,
+        attention_mask=None,
+        checkpoint_batch_size=-1,
+    ):
+        q_reps = self.embed_sentences_checkpointed(
+            input_ids,
+            attention_mask,
+            self.bert_query,
+            checkpoint_batch_size,
+        )
+        return self.project_query(q_reps)
+
+    def embed_answers(
+        self,
+        input_ids,
+        attention_mask=None,
+        checkpoint_batch_size=-1,
+    ):
+        a_reps = self.embed_sentences_checkpointed(
+            input_ids,
+            attention_mask,
+            self.bert_query if self.bert_doc is None else self.bert_doc,
+            checkpoint_batch_size,
+        )
+        return self.project_doc(a_reps)
+
+    def forward(
+        self,
+        input_ids_query: torch.LongTensor,
+        attention_mask_query: Optional[torch.FloatTensor],
+        input_ids_doc: torch.LongTensor,
+        attention_mask_doc: Optional[torch.FloatTensor],
+        checkpoint_batch_size: int = -1,
+    ) -> torch.FloatTensor:
+        r"""
+        Args:
+            input_ids_query (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary for the queries in a batch.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask_query (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            input_ids_doc (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary for the documents in a batch.
+            attention_mask_doc (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on documents padding token indices.
+            checkpoint_batch_size (`int`, *optional*, defaults to `-1`):
+                If greater than 0, uses gradient checkpointing to only compute sequence representation on
+                `checkpoint_batch_size` examples at a time on the GPU. All query representations are still compared to
+                all document representations in the batch.
+
+        Return:
+            `torch.FloatTensor``: The bidirectional cross-entropy loss obtained while trying to match each query to its
+            corresponding document and each document to its corresponding query in the batch
+        """
+        device = input_ids_query.device
+        q_reps = self.embed_questions(input_ids_query, attention_mask_query, checkpoint_batch_size)
+        a_reps = self.embed_answers(input_ids_doc, attention_mask_doc, checkpoint_batch_size)
+        compare_scores = torch.mm(q_reps, a_reps.t())
+        loss_qa = self.ce_loss(compare_scores, torch.arange(compare_scores.shape[1]).to(device))
+        loss_aq = self.ce_loss(compare_scores.t(), torch.arange(compare_scores.shape[0]).to(device))
+        loss = (loss_qa + loss_aq) / 2
+        return loss
+
+
+__all__ = ["RetriBertModel", "RetriBertPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py
new file mode 100644
index 0000000000000000000000000000000000000000..288b46e267bccb358fa86e74ac2b98dcb99e8904
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert.py
@@ -0,0 +1,475 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for RetriBERT."""
+
+import collections
+import os
+import unicodedata
+from typing import Optional
+
+from ....tokenization_utils import PreTrainedTokenizer, _is_control, _is_punctuation, _is_whitespace
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt"}
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Runs basic whitespace cleaning and splitting on a piece of text."""
+    text = text.strip()
+    if not text:
+        return []
+    tokens = text.split()
+    return tokens
+
+
+class RetriBertTokenizer(PreTrainedTokenizer):
+    r"""
+    Constructs a RetriBERT tokenizer.
+
+    [`RetriBertTokenizer`] is identical to [`BertTokenizer`] and runs end-to-end tokenization: punctuation splitting
+    and wordpiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer
+    to: this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        do_basic_tokenize (`bool`, *optional*, defaults to `True`):
+            Whether or not to do basic tokenization before WordPiece.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        do_lower_case=True,
+        do_basic_tokenize=True,
+        never_split=None,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        if not os.path.isfile(vocab_file):
+            raise ValueError(
+                f"Can't find a vocabulary file at path '{vocab_file}'. To load the vocabulary from a Google pretrained"
+                " model use `tokenizer = BertTokenizer.from_pretrained(PRETRAINED_MODEL_NAME)`"
+            )
+        self.vocab = load_vocab(vocab_file)
+        self.ids_to_tokens = collections.OrderedDict([(ids, tok) for tok, ids in self.vocab.items()])
+        self.do_basic_tokenize = do_basic_tokenize
+        if do_basic_tokenize:
+            self.basic_tokenizer = BasicTokenizer(
+                do_lower_case=do_lower_case,
+                never_split=never_split,
+                tokenize_chinese_chars=tokenize_chinese_chars,
+                strip_accents=strip_accents,
+            )
+
+        self.wordpiece_tokenizer = WordpieceTokenizer(vocab=self.vocab, unk_token=str(unk_token))
+
+        super().__init__(
+            do_lower_case=do_lower_case,
+            do_basic_tokenize=do_basic_tokenize,
+            never_split=never_split,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+    @property
+    def do_lower_case(self):
+        return self.basic_tokenizer.do_lower_case
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        return dict(self.vocab, **self.added_tokens_encoder)
+
+    def _tokenize(self, text, split_special_tokens=False):
+        split_tokens = []
+        if self.do_basic_tokenize:
+            for token in self.basic_tokenizer.tokenize(
+                text, never_split=self.all_special_tokens if not split_special_tokens else None
+            ):
+                # If the token is part of the never_split set
+                if token in self.basic_tokenizer.never_split:
+                    split_tokens.append(token)
+                else:
+                    split_tokens += self.wordpiece_tokenizer.tokenize(token)
+        else:
+            split_tokens = self.wordpiece_tokenizer.tokenize(text)
+        return split_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.ids_to_tokens.get(index, self.unk_token)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        out_string = " ".join(tokens).replace(" ##", "").strip()
+        return out_string
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is not None:
+            return [1] + ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        index = 0
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "w", encoding="utf-8") as writer:
+            for token, token_index in sorted(self.vocab.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {vocab_file}: vocabulary indices are not consecutive."
+                        " Please check that the vocabulary is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(token + "\n")
+                index += 1
+        return (vocab_file,)
+
+
+class BasicTokenizer:
+    """
+    Constructs a BasicTokenizer that will run basic tokenization (punctuation splitting, lower casing, etc.).
+
+    Args:
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        never_split (`Iterable`, *optional*):
+            Collection of tokens which will never be split during tokenization. Only has an effect when
+            `do_basic_tokenize=True`
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters.
+
+            This should likely be deactivated for Japanese (see this
+            [issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+        do_split_on_punc (`bool`, *optional*, defaults to `True`):
+            In some instances we want to skip the basic punctuation splitting so that later tokenization can capture
+            the full context of the words, such as contractions.
+    """
+
+    def __init__(
+        self,
+        do_lower_case=True,
+        never_split=None,
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        do_split_on_punc=True,
+    ):
+        if never_split is None:
+            never_split = []
+        self.do_lower_case = do_lower_case
+        self.never_split = set(never_split)
+        self.tokenize_chinese_chars = tokenize_chinese_chars
+        self.strip_accents = strip_accents
+        self.do_split_on_punc = do_split_on_punc
+
+    def tokenize(self, text, never_split=None):
+        """
+        Basic Tokenization of a piece of text. For sub-word tokenization, see WordPieceTokenizer.
+
+        Args:
+            never_split (`List[str]`, *optional*)
+                Kept for backward compatibility purposes. Now implemented directly at the base class level (see
+                [`PreTrainedTokenizer.tokenize`]) List of token not to split.
+        """
+        # union() returns a new set by concatenating the two sets.
+        never_split = self.never_split.union(set(never_split)) if never_split else self.never_split
+        text = self._clean_text(text)
+
+        # This was added on November 1st, 2018 for the multilingual and Chinese
+        # models. This is also applied to the English models now, but it doesn't
+        # matter since the English models were not trained on any Chinese data
+        # and generally don't have any Chinese data in them (there are Chinese
+        # characters in the vocabulary because Wikipedia does have some Chinese
+        # words in the English Wikipedia.).
+        if self.tokenize_chinese_chars:
+            text = self._tokenize_chinese_chars(text)
+        # prevents treating the same character with different unicode codepoints as different characters
+        unicode_normalized_text = unicodedata.normalize("NFC", text)
+        orig_tokens = whitespace_tokenize(unicode_normalized_text)
+        split_tokens = []
+        for token in orig_tokens:
+            if token not in never_split:
+                if self.do_lower_case:
+                    token = token.lower()
+                    if self.strip_accents is not False:
+                        token = self._run_strip_accents(token)
+                elif self.strip_accents:
+                    token = self._run_strip_accents(token)
+            split_tokens.extend(self._run_split_on_punc(token, never_split))
+
+        output_tokens = whitespace_tokenize(" ".join(split_tokens))
+        return output_tokens
+
+    def _run_strip_accents(self, text):
+        """Strips accents from a piece of text."""
+        text = unicodedata.normalize("NFD", text)
+        output = []
+        for char in text:
+            cat = unicodedata.category(char)
+            if cat == "Mn":
+                continue
+            output.append(char)
+        return "".join(output)
+
+    def _run_split_on_punc(self, text, never_split=None):
+        """Splits punctuation on a piece of text."""
+        if not self.do_split_on_punc or (never_split is not None and text in never_split):
+            return [text]
+        chars = list(text)
+        i = 0
+        start_new_word = True
+        output = []
+        while i < len(chars):
+            char = chars[i]
+            if _is_punctuation(char):
+                output.append([char])
+                start_new_word = True
+            else:
+                if start_new_word:
+                    output.append([])
+                start_new_word = False
+                output[-1].append(char)
+            i += 1
+
+        return ["".join(x) for x in output]
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if self._is_chinese_char(cp):
+                output.append(" ")
+                output.append(char)
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and handled
+        # like the all of the other languages.
+        if (
+            (cp >= 0x4E00 and cp <= 0x9FFF)
+            or (cp >= 0x3400 and cp <= 0x4DBF)
+            or (cp >= 0x20000 and cp <= 0x2A6DF)
+            or (cp >= 0x2A700 and cp <= 0x2B73F)
+            or (cp >= 0x2B740 and cp <= 0x2B81F)
+            or (cp >= 0x2B820 and cp <= 0x2CEAF)
+            or (cp >= 0xF900 and cp <= 0xFAFF)
+            or (cp >= 0x2F800 and cp <= 0x2FA1F)
+        ):
+            return True
+
+        return False
+
+    def _clean_text(self, text):
+        """Performs invalid character removal and whitespace cleanup on text."""
+        output = []
+        for char in text:
+            cp = ord(char)
+            if cp == 0 or cp == 0xFFFD or _is_control(char):
+                continue
+            if _is_whitespace(char):
+                output.append(" ")
+            else:
+                output.append(char)
+        return "".join(output)
+
+
+class WordpieceTokenizer:
+    """Runs WordPiece tokenization."""
+
+    def __init__(self, vocab, unk_token, max_input_chars_per_word=100):
+        self.vocab = vocab
+        self.unk_token = unk_token
+        self.max_input_chars_per_word = max_input_chars_per_word
+
+    def tokenize(self, text):
+        """
+        Tokenizes a piece of text into its word pieces. This uses a greedy longest-match-first algorithm to perform
+        tokenization using the given vocabulary.
+
+        For example, `input = "unaffable"` will return as output `["un", "##aff", "##able"]`.
+
+        Args:
+            text: A single token or whitespace separated tokens. This should have
+                already been passed through *BasicTokenizer*.
+
+        Returns:
+            A list of wordpiece tokens.
+        """
+
+        output_tokens = []
+        for token in whitespace_tokenize(text):
+            chars = list(token)
+            if len(chars) > self.max_input_chars_per_word:
+                output_tokens.append(self.unk_token)
+                continue
+
+            is_bad = False
+            start = 0
+            sub_tokens = []
+            while start < len(chars):
+                end = len(chars)
+                cur_substr = None
+                while start < end:
+                    substr = "".join(chars[start:end])
+                    if start > 0:
+                        substr = "##" + substr
+                    if substr in self.vocab:
+                        cur_substr = substr
+                        break
+                    end -= 1
+                if cur_substr is None:
+                    is_bad = True
+                    break
+                sub_tokens.append(cur_substr)
+                start = end
+
+            if is_bad:
+                output_tokens.append(self.unk_token)
+            else:
+                output_tokens.extend(sub_tokens)
+        return output_tokens
+
+
+__all__ = ["RetriBertTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..5fcbc395a1b7ba7f659a9befae14c8ba658c27d0
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/retribert/tokenization_retribert_fast.py
@@ -0,0 +1,150 @@
+# coding=utf-8
+# Copyright 2018 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for RetriBERT."""
+
+import json
+from typing import Optional
+
+from tokenizers import normalizers
+
+from ....tokenization_utils_fast import PreTrainedTokenizerFast
+from ....utils import logging
+from .tokenization_retribert import RetriBertTokenizer
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.txt", "tokenizer_file": "tokenizer.json"}
+
+
+class RetriBertTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's *tokenizers* library).
+
+    [`RetriBertTokenizerFast`] is identical to [`BertTokenizerFast`] and runs end-to-end tokenization: punctuation
+    splitting and wordpiece.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        clean_text (`bool`, *optional*, defaults to `True`):
+            Whether or not to clean the text before tokenization by removing any control characters and replacing all
+            whitespaces by the classic one.
+        tokenize_chinese_chars (`bool`, *optional*, defaults to `True`):
+            Whether or not to tokenize Chinese characters. This should likely be deactivated for Japanese (see [this
+            issue](https://github.com/huggingface/transformers/issues/328)).
+        strip_accents (`bool`, *optional*):
+            Whether or not to strip all accents. If this option is not specified, then it will be determined by the
+            value for `lowercase` (as in the original BERT).
+        wordpieces_prefix (`str`, *optional*, defaults to `"##"`):
+            The prefix for subwords.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = RetriBertTokenizer
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        do_lower_case=True,
+        unk_token="[UNK]",
+        sep_token="[SEP]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        tokenize_chinese_chars=True,
+        strip_accents=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            do_lower_case=do_lower_case,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            tokenize_chinese_chars=tokenize_chinese_chars,
+            strip_accents=strip_accents,
+            **kwargs,
+        )
+
+        normalizer_state = json.loads(self.backend_tokenizer.normalizer.__getstate__())
+        if (
+            normalizer_state.get("lowercase", do_lower_case) != do_lower_case
+            or normalizer_state.get("strip_accents", strip_accents) != strip_accents
+            or normalizer_state.get("handle_chinese_chars", tokenize_chinese_chars) != tokenize_chinese_chars
+        ):
+            normalizer_class = getattr(normalizers, normalizer_state.pop("type"))
+            normalizer_state["lowercase"] = do_lower_case
+            normalizer_state["strip_accents"] = strip_accents
+            normalizer_state["handle_chinese_chars"] = tokenize_chinese_chars
+            self.backend_tokenizer.normalizer = normalizer_class(**normalizer_state)
+
+        self.do_lower_case = do_lower_case
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None):
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A BERT sequence has the following format:
+
+        - single sequence: `[CLS] X [SEP]`
+        - pair of sequences: `[CLS] A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        output = [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+
+        if token_ids_1 is not None:
+            output += token_ids_1 + [self.sep_token_id]
+
+        return output
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        files = self._tokenizer.model.save(save_directory, name=filename_prefix)
+        return tuple(files)
+
+
+__all__ = ["RetriBertTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..78c549b6e294e1ca97a718ef601472d4d400e12a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2021 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_speech_to_text_2 import *
+    from .modeling_speech_to_text_2 import *
+    from .processing_speech_to_text_2 import *
+    from .tokenization_speech_to_text_2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a7cde922740a9a2173c64311aec8f6ad5496ac03
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/configuration_speech_to_text_2.py
@@ -0,0 +1,134 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Speech2Text model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Speech2Text2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Speech2Text2ForCausalLM`]. It is used to
+    instantiate an Speech2Text2 model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Speech2Text2
+    [facebook/s2t-wav2vec2-large-en-de](https://huggingface.co/facebook/s2t-wav2vec2-large-en-de) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the Speech2Text model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`Speech2TextModel`]
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the pooler. If string, `"gelu"`, `"relu"`,
+            `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+            https://huggingface.co/papers/1909.11556>`__ for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        max_target_positions (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+
+    Example:
+
+    ```python
+    >>> from transformers import Speech2Text2Config, Speech2Text2ForCausalLM
+
+    >>> # Initializing a Speech2Text2 s2t_transformer_s style configuration
+    >>> configuration = Speech2Text2Config()
+
+    >>> # Initializing a model (with random weights) from the s2t_transformer_s style configuration
+    >>> model = Speech2Text2ForCausalLM(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "speech_to_text_2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "decoder_attention_heads", "hidden_size": "d_model"}
+
+    def __init__(
+        self,
+        vocab_size=10000,
+        decoder_layers=6,
+        decoder_ffn_dim=2048,
+        decoder_attention_heads=4,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        activation_function="relu",
+        d_model=256,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=2,
+        scale_embedding=True,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        max_target_positions=1024,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.d_model = d_model
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.decoder_layerdrop = decoder_layerdrop
+        self.use_cache = use_cache
+        self.num_hidden_layers = decoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        self.max_target_positions = max_target_positions
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+
+__all__ = ["Speech2Text2Config"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..854f21c0655046124fb439ec3e4a44395c33abcc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/modeling_speech_to_text_2.py
@@ -0,0 +1,905 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Speech2Text2 model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutputWithPastAndCrossAttentions, CausalLMOutputWithCrossAttentions
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_start_docstrings, logging, replace_return_docstrings
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_speech_to_text_2 import Speech2Text2Config
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "Speech2Text2Config"
+_CHECKPOINT_FOR_DOC = "facebook/s2t-wav2vec2-large-en-de"
+
+
+class Speech2Text2SinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # in forward put the weights on the correct dtype and device of the param
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.weights = nn.Parameter(emb_weights)
+        self.weights.requires_grad = False
+        self.weights.detach_()
+
+    @staticmethod
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build sinusoidal embeddings. This matches the implementation in tensor2tensor, but differs slightly from the
+        description in Section 3.5 of "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+        return emb.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(self, input_ids: torch.Tensor, past_key_values_length: int = 0):
+        bsz, seq_len = input_ids.size()
+        # Create the position ids from the input token ids. Any padded tokens remain padded.
+        position_ids = self.create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
+            input_ids.device
+        )
+
+        # expand embeddings if needed
+        max_pos = self.padding_idx + 1 + seq_len
+        if max_pos > self.weights.size(0):
+            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, -1).detach()
+
+    def create_position_ids_from_input_ids(
+        self, input_ids: torch.Tensor, padding_idx: int, past_key_values_length: Optional[int] = 0
+    ):
+        """
+        Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding
+        symbols are ignored. This is modified from fairseq's `utils.make_positions`.
+
+        Args:
+            x: torch.Tensor x:
+        Returns: torch.Tensor
+        """
+        # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+        mask = input_ids.ne(padding_idx).int()
+        incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+        return incremental_indices.long() + padding_idx
+
+
+class Speech2Text2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[Speech2Text2Config] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        # `past_key_values[0].shape[2] == key_value_states.shape[1]`
+        # is checking that the `sequence_length` of the `past_key_values` is the same as
+        # the provided `key_value_states` to support prefix tuning
+        if (
+            is_cross_attention
+            and past_key_values is not None
+            and past_key_values[0].shape[2] == key_value_states.shape[1]
+        ):
+            # reuse k,v, cross_attentions
+            key_states = past_key_values[0]
+            value_states = past_key_values[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_values is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = torch.cat([past_key_values[0], key_states], dim=2)
+            value_states = torch.cat([past_key_values[1], value_states], dim=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(torch.Tensor, torch.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_values` is always `None`
+            past_key_values = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, bsz).view(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            if layer_head_mask.size() != (self.num_heads,):
+                raise ValueError(
+                    f"Head mask for a single layer should be of size {(self.num_heads,)}, but is"
+                    f" {layer_head_mask.size()}"
+                )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped, past_key_values
+
+
+class Speech2Text2DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Speech2Text2Config):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = Speech2Text2Attention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        if config.is_decoder:
+            self.encoder_attn = Speech2Text2Attention(
+                self.embed_dim,
+                config.decoder_attention_heads,
+                dropout=config.attention_dropout,
+                is_decoder=True,
+            )
+            self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size *(decoder_attention_heads,)*.
+            past_key_values (`Tuple(torch.FloatTensor)`): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+class Speech2Text2PreTrainedModel(PreTrainedModel):
+    config: Speech2Text2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, Speech2Text2SinusoidalPositionalEmbedding):
+            weight = module.get_embedding(*module.weight.shape, module.padding_idx)
+            weight = nn.Parameter(weight, requires_grad=False)
+            weight.detach_()
+            module.weight = weight
+
+
+SPEECH_TO_TEXT_2_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`Speech2Text2Config`]):
+            Model configuration class with all the parameters of the model. Initializing with a config file does not
+            load the weights associated with the model, only the configuration. Check out the
+            [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+
+class Speech2Text2Decoder(Speech2Text2PreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`Speech2Text2DecoderLayer`]
+
+    Args:
+        config: Speech2Text2Config
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: Speech2Text2Config):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_target_positions
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+
+        self.embed_positions = Speech2Text2SinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.d_model,
+            self.padding_idx,
+        )
+
+        self.layers = nn.ModuleList([Speech2Text2DecoderLayer(config) for _ in range(config.decoder_layers)])
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules in encoder to avoid performing cross-attention
+                on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        # past_key_values_length
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        attention_mask = _prepare_4d_causal_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _prepare_4d_attention_mask(
+                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # embed positions
+        positions = self.embed_positions(input_ids, past_key_values_length=past_key_values_length)
+
+        hidden_states = inputs_embeds + positions
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache = True` is incompatible with gradient checkpointing. Setting `use_cache = False`..."
+                )
+                use_cache = False
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        next_decoder_cache = () if use_cache else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != (len(self.layers)):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+                past_key_values=past_key_values[idx] if past_key_values is not None else None,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+            )
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                next_decoder_cache += (layer_outputs[3 if output_attentions else 1],)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        next_cache = next_decoder_cache if use_cache else None
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, next_cache, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The Speech2Text2 Model with a language modeling head. Can be used for summarization.",
+    SPEECH_TO_TEXT_2_START_DOCSTRING,
+)
+class Speech2Text2DecoderWrapper(Speech2Text2PreTrainedModel):
+    """
+    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
+    used in combination with the [`EncoderDecoderModel`] framework.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.decoder = Speech2Text2Decoder(config)
+
+    def forward(self, *args, **kwargs):
+        return self.decoder(*args, **kwargs)
+
+
+@add_start_docstrings(
+    "The Speech2Text2 Decoder with a language modeling head. Can be used as the decoder part of"
+    " [`EncoderDecoderModel`] and [`SpeechEncoderDecoder`].",
+    SPEECH_TO_TEXT_2_START_DOCSTRING,
+)
+class Speech2Text2ForCausalLM(Speech2Text2PreTrainedModel):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        config.is_decoder = True
+        config.is_encoder_decoder = False
+        super().__init__(config)
+        self.model = Speech2Text2DecoderWrapper(config)
+
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
+    def set_decoder(self, decoder):
+        self.model.decoder = decoder
+
+    def get_decoder(self):
+        return self.model.decoder
+
+    @replace_return_docstrings(output_type=CausalLMOutputWithCrossAttentions, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], CausalLMOutputWithCrossAttentions]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`Speech2Text2Tokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                if the model is configured as a decoder.
+            encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used
+                in the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple(tuple(torch.FloatTensor))`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
+                shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`) and 2 additional tensors of
+                shape `(batch_size, num_heads, encoder_sequence_length, embed_size_per_head)`. The two additional
+                tensors are only required when the model is used as a decoder in a Sequence to Sequence model.
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+                config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+                (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import (
+        ...     SpeechEncoderDecoderModel,
+        ...     Speech2Text2ForCausalLM,
+        ...     Wav2Vec2Model,
+        ...     Speech2Text2Config,
+        ...     Wav2Vec2Config,
+        ...     Wav2Vec2FeatureExtractor,
+        ...     Speech2Text2Tokenizer,
+        ... )
+        >>> from datasets import load_dataset
+
+        >>> feature_extractor = Wav2Vec2FeatureExtractor()
+        >>> tokenizer = Speech2Text2Tokenizer.from_pretrained("facebook/s2t-wav2vec2-large-en-de")
+
+        >>> encoder = Wav2Vec2Model(Wav2Vec2Config())
+        >>> decoder = Speech2Text2ForCausalLM(Speech2Text2Config())
+        >>> # init random speech2text model
+
+        >>> model = SpeechEncoderDecoderModel(encoder=encoder, decoder=decoder)
+        >>> model.config.pad_token_id = tokenizer.pad_token_id
+        >>> model.config.decoder_start_token_id = tokenizer.bos_token_id
+        >>> # pre-process inputs and labels
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> inputs = feature_extractor(
+        ...     ds[0]["audio"]["array"], sampling_rate=ds[0]["audio"]["sampling_rate"], return_tensors="pt"
+        ... )
+        >>> input_values = inputs.input_values
+        >>> decoder_input_ids = tokenizer(ds[0]["text"], return_tensors="pt").input_ids
+        >>> # compute loss
+
+        >>> loss = model(inputs=input_values, labels=decoder_input_ids).loss
+        >>> # backprop loss
+
+        >>> loss.backward()  # doctest: +IGNORE_RESULT
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model.decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        logits = self.lm_head(outputs[0])
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
+    ):
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_ids.shape)
+
+        if past_key_values:
+            past_length = past_key_values.get_seq_length()
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+        # first step, decoder_cached_states are empty
+        return {
+            "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
+            "attention_mask": attention_mask,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
+__all__ = ["Speech2Text2ForCausalLM", "Speech2Text2PreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..b69420bf540cbe737d6509b79d943af9d41e6c24
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/processing_speech_to_text_2.py
@@ -0,0 +1,105 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for Speech2Text2
+"""
+
+import warnings
+from contextlib import contextmanager
+
+from ....processing_utils import ProcessorMixin
+
+
+class Speech2Text2Processor(ProcessorMixin):
+    r"""
+    Constructs a Speech2Text2 processor which wraps a Speech2Text2 feature extractor and a Speech2Text2 tokenizer into
+    a single processor.
+
+    [`Speech2Text2Processor`] offers all the functionalities of [`AutoFeatureExtractor`] and [`Speech2Text2Tokenizer`].
+    See the [`~Speech2Text2Processor.__call__`] and [`~Speech2Text2Processor.decode`] for more information.
+
+    Args:
+        feature_extractor (`AutoFeatureExtractor`):
+            An instance of [`AutoFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`Speech2Text2Tokenizer`):
+            An instance of [`Speech2Text2Tokenizer`]. The tokenizer is a required input.
+    """
+
+    feature_extractor_class = "AutoFeatureExtractor"
+    tokenizer_class = "Speech2Text2Tokenizer"
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def __call__(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to AutoFeatureExtractor's
+        [`~AutoFeatureExtractor.__call__`] and returns its output. If used in the context
+        [`~Speech2Text2Processor.as_target_processor`] this method forwards all its arguments to
+        Speech2Text2Tokenizer's [`~Speech2Text2Tokenizer.__call__`]. Please refer to the docstring of the above two
+        methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the tokenizer for processing the input. Useful for encoding the labels when fine-tuning
+        Speech2Text2.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        yield
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+
+__all__ = ["Speech2Text2Processor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py
new file mode 100644
index 0000000000000000000000000000000000000000..05c10080b7732601efd476a8c6e2640e74450a76
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/speech_to_text_2/tokenization_speech_to_text_2.py
@@ -0,0 +1,252 @@
+# coding=utf-8
+# Copyright 2021 The Facebook Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Speech2Text2."""
+
+import json
+import os
+from typing import Optional
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+VOCAB_FILES_NAMES = {
+    "vocab_file": "vocab.json",
+    "tokenizer_config_file": "tokenizer_config.json",
+    "merges_file": "merges.txt",
+}
+
+
+BPE_TOKEN_MERGES = "</w>"
+BPE_TOKEN_VOCAB = "@@ "
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word. word is represented as tuple of symbols (symbols being variable-length
+    strings)
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+# Speech2Text2 has no max input length
+
+
+class Speech2Text2Tokenizer(PreTrainedTokenizer):
+    """
+    Constructs a Speech2Text2Tokenizer.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains some of the main methods. Users should refer to
+    the superclass for more information regarding such methods.
+
+    Args:
+        vocab_file (`str`):
+            File containing the vocabulary.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sentence token.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sentence token.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+
+        **kwargs
+            Additional keyword arguments passed along to [`PreTrainedTokenizer`]
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        do_lower_case=False,
+        merges_file=None,
+        **kwargs,
+    ):
+        self.do_lower_case = do_lower_case
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+
+        if merges_file is None:
+            logger.info(f"No merges files provided. {self.__class__.__name__} can only be used for decoding.")
+
+            self.bpe_ranks = None
+            self.cache = None
+        else:
+            with open(merges_file, encoding="utf-8") as merges_handle:
+                merges = merges_handle.read().split("\n")[:-1]
+
+            merges = [tuple(merge.split()[:2]) for merge in merges]
+            self.bpe_ranks = dict(zip(merges, range(len(merges))))
+            self.cache = {}
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            do_lower_case=do_lower_case,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.decoder)
+
+    def get_vocab(self) -> dict:
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token):
+        word = tuple(token[:-1]) + (token[-1] + BPE_TOKEN_MERGES,)
+        if token in self.cache:
+            return self.cache[token]
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        if word == "\n  " + BPE_TOKEN_MERGES:
+            word = "\n" + BPE_TOKEN_MERGES
+
+        if word.endswith(BPE_TOKEN_MERGES):
+            word = word.replace(BPE_TOKEN_MERGES, "")
+
+        word = word.replace(" ", BPE_TOKEN_VOCAB)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+
+        if self.bpe_ranks is None:
+            raise ValueError(
+                "This tokenizer was instantiated without a `merges.txt` file, so"
+                " that it can only be used for decoding, not for encoding. "
+                "Make sure to provide `merges.txt` file at instantiation to enable "
+                "encoding."
+            )
+
+        if self.do_lower_case:
+            text = text.lower()
+
+        text = text.split()
+
+        split_tokens = []
+        for token in text:
+            if token:
+                split_tokens.extend(list(self.bpe(token).split(" ")))
+
+        return split_tokens
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token (str) in an index (integer) using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) in a token (str) using the vocab."""
+        result = self.decoder.get(index, self.unk_token)
+        return result
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        """
+        Converts a list of output tokens into a single string.
+        """
+        # combine tokens
+        string = " ".join(tokens)
+
+        # make sure @@ tokens are concatenated
+        string = "".join(string.split(BPE_TOKEN_VOCAB))
+
+        return string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merges_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        if self.bpe_ranks is None:
+            return (vocab_file,)
+
+        with open(merges_file, "w", encoding="utf-8") as writer:
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merges_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return (vocab_file, merges_file)
+
+
+__all__ = ["Speech2Text2Tokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b535eb1df2c40a7680bbf8d57fc70b78e23437f4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .tokenization_tapex import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py
new file mode 100644
index 0000000000000000000000000000000000000000..b32383ddd497828b6a429d39c5d4069a21154b9f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tapex/tokenization_tapex.py
@@ -0,0 +1,1470 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for TAPEX."""
+
+import json
+import os
+import random
+from functools import lru_cache
+from typing import Optional, Union
+
+import regex as re
+
+from ....file_utils import ExplicitEnum, PaddingStrategy, TensorType, add_end_docstrings, is_pandas_available
+from ....tokenization_utils import AddedToken, PreTrainedTokenizer
+from ....tokenization_utils_base import ENCODE_KWARGS_DOCSTRING, BatchEncoding, TextInput, TruncationStrategy
+from ....utils import logging
+
+
+if is_pandas_available():
+    import pandas as pd
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json", "merges_file": "merges.txt"}
+
+
+class TapexTruncationStrategy(ExplicitEnum):
+    """
+    Possible values for the `truncation` argument in [`~TapasTokenizer.__call__`]. Useful for tab-completion in an IDE.
+    """
+
+    DROP_ROWS_TO_FIT = "drop_rows_to_fit"
+
+
+TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str`, [`TapexTruncationStrategy`] or [`~tokenization_utils_base.TruncationStrategy`],
+                   *optional*, defaults to `False`):
+
+                Activates and controls truncation. Accepts the following values:
+
+                - `'drop_rows_to_fit'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will truncate
+                  row by row, removing rows from the table.
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters. If left unset or set to
+                `None`, this will use the predefined model maximum length if a maximum length is required by one of the
+                truncation/padding parameters. If the model has no specific maximum input length (like XLNet)
+                truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+"""
+
+
+@lru_cache
+def bytes_to_unicode():
+    """
+    Returns list of utf-8 byte and a mapping to unicode strings. We specifically avoids mapping to whitespace/control
+    characters the bpe code barfs on. The reversible bpe codes work on unicode strings. This means you need a large #
+    of unicode characters in your vocab if you want to avoid UNKs. When you're at something like a 10B token dataset
+    you end up needing around 5K for decent coverage. This is a significant percentage of your normal, say, 32K bpe
+    vocab. To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    """
+    bs = (
+        list(range(ord("!"), ord("~") + 1)) + list(range(ord("¡"), ord("¬") + 1)) + list(range(ord("®"), ord("ÿ") + 1))
+    )
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8 + n)
+            n += 1
+    cs = [chr(n) for n in cs]
+    return dict(zip(bs, cs))
+
+
+def get_pairs(word):
+    """
+    Return set of symbol pairs in a word. Word is represented as tuple of symbols (symbols being variable-length
+    strings).
+    """
+    pairs = set()
+    prev_char = word[0]
+    for char in word[1:]:
+        pairs.add((prev_char, char))
+        prev_char = char
+    return pairs
+
+
+class IndexedRowTableLinearize:
+    """
+    FORMAT: col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
+    """
+
+    def process_table(self, table_content: dict):
+        """
+        Given a table, TableLinearize aims at converting it into a flatten sequence with special symbols.
+        """
+        assert "header" in table_content and "rows" in table_content, self.PROMPT_MESSAGE
+        # process header
+        table_str = self.process_header(table_content["header"]) + " "
+        # process rows
+        for i, row_example in enumerate(table_content["rows"]):
+            # NOTE: the row should start from row 1 instead of 0
+            table_str += self.process_row(row_example, row_index=i + 1) + " "
+        return table_str.strip()
+
+    def process_header(self, headers: list):
+        """
+        Given a list of headers, TableLinearize aims at converting it into a flatten sequence with special symbols.
+        """
+        return "col : " + " | ".join(headers)
+
+    def process_row(self, row: list, row_index: int):
+        """
+        Given a row, TableLinearize aims at converting it into a flatten sequence with special symbols.
+        """
+        row_str = ""
+        row_cell_values = []
+        for cell_value in row:
+            if isinstance(cell_value, int):
+                row_cell_values.append(str(cell_value))
+            else:
+                row_cell_values.append(cell_value)
+        row_str += " | ".join(row_cell_values)
+        return "row " + str(row_index) + " : " + row_str
+
+
+class TapexTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a TAPEX tokenizer. Based on byte-level Byte-Pair-Encoding (BPE).
+
+    This tokenizer can be used to flatten one or more table(s) and concatenate them with one or more related sentences
+    to be used by TAPEX models. The format that the TAPEX tokenizer creates is the following:
+
+    sentence col: col1 | col2 | col 3 row 1 : val1 | val2 | val3 row 2 : ...
+
+    The tokenizer supports a single table + single query, a single table and multiple queries (in which case the table
+    will be duplicated for every query), a single query and multiple tables (in which case the query will be duplicated
+    for every table), and multiple tables and queries. In other words, you can provide a batch of tables + questions to
+    the tokenizer for instance to prepare them for the model.
+
+    Tokenization itself is based on the BPE algorithm. It is identical to the one used by BART, RoBERTa and GPT-2.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        merges_file (`str`):
+            Path to the merges file.
+        do_lower_case (`bool`, *optional*, defaults to `True`):
+            Whether or not to lowercase the input when tokenizing.
+        errors (`str`, *optional*, defaults to `"replace"`):
+            Paradigm to follow when decoding bytes to UTF-8. See
+            [bytes.decode](https://docs.python.org/3/library/stdtypes.html#bytes.decode) for more information.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        add_prefix_space (`bool`, *optional*, defaults to `False`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word. (BART tokenizer detect beginning of words by the preceding space).
+        max_cell_length (`int`, *optional*, defaults to 15):
+            Maximum number of characters per cell when linearizing a table. If this number is exceeded, truncation
+            takes place.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        merges_file,
+        do_lower_case=True,
+        errors="replace",
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        add_prefix_space=False,
+        max_cell_length=15,
+        **kwargs,
+    ):
+        bos_token = AddedToken(bos_token, lstrip=False, rstrip=False) if isinstance(bos_token, str) else bos_token
+        eos_token = AddedToken(eos_token, lstrip=False, rstrip=False) if isinstance(eos_token, str) else eos_token
+        sep_token = AddedToken(sep_token, lstrip=False, rstrip=False) if isinstance(sep_token, str) else sep_token
+        cls_token = AddedToken(cls_token, lstrip=False, rstrip=False) if isinstance(cls_token, str) else cls_token
+        unk_token = AddedToken(unk_token, lstrip=False, rstrip=False) if isinstance(unk_token, str) else unk_token
+        pad_token = AddedToken(pad_token, lstrip=False, rstrip=False) if isinstance(pad_token, str) else pad_token
+
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.encoder = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.encoder.items()}
+        self.errors = errors  # how to handle errors in decoding
+        self.byte_encoder = bytes_to_unicode()
+        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}
+        with open(merges_file, encoding="utf-8") as merges_handle:
+            bpe_merges = merges_handle.read().split("\n")[1:-1]
+        bpe_merges = [tuple(merge.split()) for merge in bpe_merges]
+        self.bpe_ranks = dict(zip(bpe_merges, range(len(bpe_merges))))
+        self.cache = {}
+        self.add_prefix_space = add_prefix_space
+        self.do_lower_case = do_lower_case
+
+        # Should have added re.IGNORECASE so BPE merges can happen for capitalized versions of contractions
+        self.pat = re.compile(r"""'s|'t|'re|'ve|'m|'ll|'d| ?\p{L}+| ?\p{N}+| ?[^\s\p{L}\p{N}]+|\s+(?!\S)|\s+""")
+
+        # additional properties
+
+        super().__init__(
+            vocab_file=vocab_file,
+            merges_file=merges_file,
+            do_lower_case=do_lower_case,
+            errors=errors,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            add_prefix_space=add_prefix_space,
+            max_cell_length=max_cell_length,
+            **kwargs,
+        )
+
+        self.max_cell_length = max_cell_length
+        self.table_linearize = IndexedRowTableLinearize()
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A TAPEX sequence has the following format:
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Args:
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Args:
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. TAPEX does not:
+        make use of token type ids, therefore a list of zeros is returned.
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+        Returns:
+            `list[int]`: List of zeros.
+        """
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    def prepare_for_tokenization(self, text, is_split_into_words=False, **kwargs):
+        add_prefix_space = kwargs.pop("add_prefix_space", self.add_prefix_space)
+        if (is_split_into_words or add_prefix_space) and (len(text) > 0 and not text[0].isspace()):
+            text = " " + text
+        return (text, kwargs)
+
+    @property
+    def vocab_size(self):
+        return len(self.encoder)
+
+    def get_vocab(self):
+        return dict(self.encoder, **self.added_tokens_encoder)
+
+    def bpe(self, token):
+        if token in self.cache:
+            return self.cache[token]
+        word = tuple(token)
+        pairs = get_pairs(word)
+
+        if not pairs:
+            return token
+
+        while True:
+            bigram = min(pairs, key=lambda pair: self.bpe_ranks.get(pair, float("inf")))
+            if bigram not in self.bpe_ranks:
+                break
+            first, second = bigram
+            new_word = []
+            i = 0
+            while i < len(word):
+                try:
+                    j = word.index(first, i)
+                except ValueError:
+                    new_word.extend(word[i:])
+                    break
+                else:
+                    new_word.extend(word[i:j])
+                    i = j
+
+                if word[i] == first and i < len(word) - 1 and word[i + 1] == second:
+                    new_word.append(first + second)
+                    i += 2
+                else:
+                    new_word.append(word[i])
+                    i += 1
+            new_word = tuple(new_word)
+            word = new_word
+            if len(word) == 1:
+                break
+            else:
+                pairs = get_pairs(word)
+        word = " ".join(word)
+        self.cache[token] = word
+        return word
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        bpe_tokens = []
+        for token in re.findall(self.pat, text):
+            token = "".join(
+                self.byte_encoder[b] for b in token.encode("utf-8")
+            )  # Maps all our bytes to unicode strings, avoiding control tokens of the BPE (spaces in our case)
+            bpe_tokens.extend(bpe_token for bpe_token in self.bpe(token).split(" "))
+        return bpe_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.encoder.get(token, self.encoder.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        text = "".join(tokens)
+        text = bytearray([self.byte_decoder[c] for c in text]).decode("utf-8", errors=self.errors)
+        return text
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+        merge_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["merges_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.encoder, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        index = 0
+        with open(merge_file, "w", encoding="utf-8") as writer:
+            writer.write("#version: 0.2\n")
+            for bpe_tokens, token_index in sorted(self.bpe_ranks.items(), key=lambda kv: kv[1]):
+                if index != token_index:
+                    logger.warning(
+                        f"Saving vocabulary to {merge_file}: BPE merge indices are not consecutive."
+                        " Please check that the tokenizer is not corrupted!"
+                    )
+                    index = token_index
+                writer.write(" ".join(bpe_tokens) + "\n")
+                index += 1
+
+        return vocab_file, merge_file
+
+    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        table: Union["pd.DataFrame", list["pd.DataFrame"]] = None,
+        query: Optional[Union[TextInput, list[TextInput]]] = None,
+        answer: Optional[Union[str, list[str]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several table-sequence pair(s).
+
+        Args:
+            table (`pd.DataFrame`, `list[pd.DataFrame]`):
+                Table(s) containing tabular data.
+            query (`str` or `list[str]`, *optional*):
+                Sentence or batch of sentences related to one or more table(s) to be encoded. Note that the number of
+                sentences must match the number of tables.
+            answer (`str` or `list[str]`, *optional*):
+                Optionally, the corresponding answer to the questions as supervision.
+        """
+
+        if table is not None:
+            return self.source_call_func(
+                table=table,
+                query=query,
+                answer=answer,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        elif answer is not None:
+            return self.target_call_func(
+                answer=answer,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            raise ValueError("You need to provide either a `table` or an `answer`.")
+
+    def source_call_func(
+        self,
+        table: Union["pd.DataFrame", list["pd.DataFrame"]],
+        query: Optional[Union[TextInput, list[TextInput]]] = None,
+        answer: Optional[Union[str, list[str]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # Input type checking for clearer error
+        valid_table = False
+        valid_query = False
+
+        # Check that table have a valid type
+        if isinstance(table, pd.DataFrame):
+            valid_table = True
+        elif isinstance(table, (list, tuple)) and isinstance(table[0], pd.DataFrame):
+            valid_table = True
+
+        # Check that query have a valid type
+        if query is None or isinstance(query, str):
+            valid_query = True
+        elif isinstance(query, (list, tuple)):
+            if len(query) == 0 or isinstance(query[0], str):
+                valid_query = True
+
+        if not valid_table:
+            raise ValueError(
+                "table input must of type `pd.DataFrame` (single example), `list[pd.DataFrame]` (batch of examples). "
+            )
+        if not valid_query:
+            raise ValueError("query input must of type `str` (single example), `list[str]` (batch of examples). ")
+        is_batched = isinstance(table, (list, tuple)) or isinstance(query, (list, tuple))
+
+        if is_batched:
+            return self.batch_encode_plus(
+                table=table,
+                query=query,
+                answer=answer,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                table=table,
+                query=query,
+                answer=answer,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def batch_encode_plus(
+        self,
+        table: Union["pd.DataFrame", list["pd.DataFrame"]],
+        query: Optional[list[TextInput]] = None,
+        answer: Optional[list[str]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Optional[Union[bool, str]] = None,
+        max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        <Tip warning={true}>
+
+        This method is deprecated, `__call__` should be used instead.
+
+        </Tip>
+        """
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._batch_encode_plus(
+            table=table,
+            query=query,
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _batch_encode_plus(
+        self,
+        table: Union["pd.DataFrame", list["pd.DataFrame"]],
+        query: Optional[list[TextInput]] = None,
+        answer: Optional[list[str]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast."
+            )
+
+        if isinstance(table, pd.DataFrame) and isinstance(query, (list, tuple)):
+            # single table, many queries case
+            # duplicate table for every query
+            table = [table] * len(query)
+        if isinstance(table, (list, tuple)) and isinstance(query, str):
+            # many tables, single query case
+            # duplicate query for every table
+            query = [query] * len(table)
+
+        batch_outputs = self._batch_prepare_for_model(
+            table=table,
+            query=query,
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            return_tensors=return_tensors,
+            verbose=verbose,
+        )
+
+        return BatchEncoding(batch_outputs)
+
+    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def _batch_prepare_for_model(
+        self,
+        table: Union["pd.DataFrame", list["pd.DataFrame"]],
+        query: Optional[Union[TextInput, list[TextInput]]] = None,
+        answer: Optional[Union[str, list[str]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        This method adds special tokens, truncates sequences if overflowing while taking into account the special
+        tokens and manages a moving window (with user defined stride) for overflowing tokens.
+        """
+        batch_outputs = {}
+        if answer is None:
+            answer = [None] * len(table)
+        for _table, _query, _answer in zip(table, query, answer):
+            text = self.prepare_table_query(
+                _table, _query, _answer, truncation_strategy=truncation_strategy, max_length=max_length
+            )
+
+            if self.do_lower_case:
+                text = text.lower()
+
+            tokens = self.tokenize(text)
+            outputs = self.prepare_for_model(
+                ids=self.convert_tokens_to_ids(tokens),
+                add_special_tokens=add_special_tokens,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterwards
+                truncation=truncation_strategy.value,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=None,  # we pad in batch afterwards
+                return_attention_mask=False,  # we pad in batch afterwards
+                return_token_type_ids=return_token_type_ids,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                return_tensors=None,  # We convert the whole batch to tensors at the end
+                prepend_batch_axis=False,
+                verbose=verbose,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+        )
+
+        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        return batch_outputs
+
+    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING)
+    def encode(
+        self,
+        table: "pd.DataFrame",
+        query: Optional[TextInput] = None,
+        answer: Optional[str] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> list[int]:
+        """
+        Prepare a table, a string and possible answer for the model. This method does not return token type IDs,
+        attention masks, etc. which are necessary for the model to work correctly. Use this method if you want to build
+        your processing on your own, otherwise refer to `__call__`.
+        """
+        encoded_inputs = self.encode_plus(
+            table,
+            query=query,
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        return encoded_inputs["input_ids"]
+
+    @add_end_docstrings(ENCODE_KWARGS_DOCSTRING, TAPEX_ENCODE_PLUS_ADDITIONAL_KWARGS_DOCSTRING)
+    def encode_plus(
+        self,
+        table: "pd.DataFrame",
+        query: Optional[TextInput] = None,
+        answer: Optional[str] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Optional[Union[bool, str]] = None,
+        max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._encode_plus(
+            table=table,
+            query=query,
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _encode_plus(
+        self,
+        table: "pd.DataFrame",
+        query: Optional[TextInput] = None,
+        answer: Optional[str] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast. "
+                "More information on available tokenizers at "
+                "https://github.com/huggingface/transformers/pull/2674"
+            )
+
+        text = self.prepare_table_query(
+            table, query, answer, truncation_strategy=truncation_strategy, max_length=max_length
+        )
+
+        # if necessary, perform lower case
+        if self.do_lower_case:
+            text = text.lower()
+
+        tokens = self.tokenize(text)
+
+        return self.prepare_for_model(
+            ids=self.convert_tokens_to_ids(tokens),
+            add_special_tokens=add_special_tokens,
+            padding=padding_strategy.value,
+            truncation=truncation_strategy.value,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            prepend_batch_axis=True,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            verbose=verbose,
+        )
+
+    def target_call_func(
+        self,
+        answer: Union[str, list[str]],
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        The method tokenizes and prepares the answer label for the model.
+
+        Args:
+            answer (`str` or `list[str]`):
+                Corresponding answer supervision to the queries for training the model.
+        """
+        is_batched = isinstance(answer, (list, tuple))
+
+        if is_batched:
+            return self.target_batch_encode_plus(
+                answer=answer,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.target_encode_plus(
+                answer=answer,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                pad_to_multiple_of=pad_to_multiple_of,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    def target_batch_encode_plus(
+        self,
+        answer: list[str],
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Optional[Union[bool, str]] = None,
+        max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepare answer strings for the model.
+
+        Args:
+            answer `list[str]`:
+                Corresponding answer supervision to the queries for training the model.
+        """
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._target_batch_encode_plus(
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _target_batch_encode_plus(
+        self,
+        answer: list[str],
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        batch_outputs = {}
+        for text in answer:
+            if self.do_lower_case:
+                text = text.lower()
+
+            tokens = self.tokenize(text)
+            outputs = self.prepare_for_model(
+                ids=self.convert_tokens_to_ids(tokens),
+                add_special_tokens=add_special_tokens,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterwards
+                truncation=truncation_strategy.value,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=None,  # we pad in batch afterwards
+                return_attention_mask=False,  # we pad in batch afterwards
+                return_token_type_ids=return_token_type_ids,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                return_tensors=None,  # We convert the whole batch to tensors at the end
+                prepend_batch_axis=False,
+                verbose=verbose,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=return_attention_mask,
+        )
+
+        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        return BatchEncoding(batch_outputs)
+
+    def target_encode(
+        self,
+        answer: str,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy, TapexTruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> list[int]:
+        """
+        Prepare the answer string for the model. This method does not return token type IDs, attention masks, etc.
+        which are necessary for the model to work correctly. Use this method if you want to build your processing on
+        your own, otherwise refer to `__call__`.
+
+        Args:
+            answer `str`:
+                Corresponding answer supervision to the queries for training the model
+        """
+        encoded_outputs = self.target_encode_plus(
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        return encoded_outputs["input_ids"]
+
+    def target_encode_plus(
+        self,
+        answer: str,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Optional[Union[bool, str]] = None,
+        max_length: Optional[int] = None,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepare a answer string for the model.
+
+        Args:
+            answer `str`:
+                Corresponding answer supervision to the queries for training the model.
+        """
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        return self._target_encode_plus(
+            answer=answer,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+    def _target_encode_plus(
+        self,
+        answer: str,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast. "
+                "More information on available tokenizers at "
+                "https://github.com/huggingface/transformers/pull/2674"
+            )
+
+        text = answer
+
+        # if necessary, perform lower case
+        if self.do_lower_case:
+            text = text.lower()
+
+        tokens = self.tokenize(text)
+
+        return self.prepare_for_model(
+            ids=self.convert_tokens_to_ids(tokens),
+            add_special_tokens=add_special_tokens,
+            padding=padding_strategy.value,
+            truncation=truncation_strategy.value,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_tensors=return_tensors,
+            prepend_batch_axis=True,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            verbose=verbose,
+        )
+
+    def prepare_table_query(
+        self,
+        table,
+        query,
+        answer=None,
+        truncation_strategy=Union[str, TruncationStrategy, TapexTruncationStrategy],
+        max_length=None,
+    ):
+        """
+        This method can be used to linearize a table and add a corresponding query.
+
+        Optionally, it also handles truncation of the table (cells).
+
+        An answer can be provided for more precise truncation.
+        """
+        if not table.empty:
+            # step 1: create table dictionary
+            table_content = {"header": list(table.columns), "rows": [list(row.values) for i, row in table.iterrows()]}
+
+            # step 2: modify table internally
+            # always truncate table cells based on self.max_cell_length
+            # optionally truncate rows if truncation_strategy is set to it
+            self.truncate_table_cells(table_content, query, answer)
+            if truncation_strategy == TapexTruncationStrategy.DROP_ROWS_TO_FIT:
+                self.truncate_table_rows(table_content, query, answer, max_length=max_length)
+
+            # step 3: linearize table
+            linear_table = self.table_linearize.process_table(table_content)
+        else:
+            linear_table = ""
+
+        if linear_table == "":
+            logger.warning(
+                "You provide an empty table, or all cells contain much tokens (e.g., >= 1024 tokens). "
+                + f"Please carefully check the corresponding table with the query : {query}."
+            )
+        if query == "":
+            logger.warning("You provide nothing to query with respect to the table.")
+        # step 4: concatenate query with linear_table
+        separator = " " if query and linear_table else ""
+        joint_input = (query + separator + linear_table) if query else linear_table
+
+        return joint_input
+
+    def truncate_table_cells(self, table_content: dict, question: str, answer: list):
+        # TODO (Qian): is it possible to revert the original cell if it is in the final answer?
+        cell_mapping = {}
+        for row in table_content["rows"]:
+            for i, cell in enumerate(row):
+                truncate_cell = self.truncate_cell(cell)
+                if truncate_cell is not None:
+                    cell_mapping[cell] = truncate_cell
+                    row[i] = truncate_cell
+
+        # modify the answer list
+        if answer is not None:
+            for i, case in enumerate(answer):
+                if case in cell_mapping:
+                    answer[i] = cell_mapping[case]
+
+    def truncate_cell(self, cell_value):
+        # do not process on these cases
+        if isinstance(cell_value, (int, float)):
+            return cell_value
+        if cell_value.strip() != "":
+            try_tokens = self.tokenize(cell_value)
+            if len(try_tokens) >= self.max_cell_length:
+                retain_tokens = try_tokens[: self.max_cell_length]
+                retain_cell_value = self.convert_tokens_to_string(retain_tokens)
+                return retain_cell_value
+            else:
+                return None
+        else:
+            return cell_value
+
+    def truncate_table_rows(
+        self, table_content: dict, question: str, answer: Optional[Union[str, list[str]]] = None, max_length=None
+    ):
+        """
+        Args:
+        table_content:
+            {"header": xxx, "rows": xxx, "id" (Optionally): xxx}
+
+        question:
+            natural language sentence
+
+        answer:
+            if for training, is the supervision; otherwise will be empty
+        """
+        delete_ratio, remain_token_len = self.estimate_delete_ratio(table_content, question, max_length)
+        # randomly delete unrelated rows
+        self.delete_unrelated_rows(table_content, question, answer, delete_ratio)
+        # guarantee the result < max_length
+        maximum_keep_rows = 0
+        for ind, row_example in enumerate(table_content["rows"]):
+            value_string = self.table_linearize.process_row(row_example, ind + 1)
+            value_token_len = len(self.tokenize(value_string))
+            # over the size limit, and take action
+            if value_token_len > remain_token_len:
+                break
+            remain_token_len -= value_token_len
+            maximum_keep_rows += 1
+        del table_content["rows"][maximum_keep_rows:]
+
+    def estimate_delete_ratio(self, table_content: dict, question: str, max_length=None):
+        if "header" not in table_content or "rows" not in table_content:
+            raise ValueError("The table content should contain both 'header' and 'rows' keys.")
+        # calculate the tokens of header, special tokens will only be pre-prepended into question
+        question_tokens = self.tokenize(question, add_special_tokens=True)
+        # calculate the tokens of header
+        header_string = self.table_linearize.process_header(table_content["header"])
+        header_tokens = self.tokenize(header_string, add_special_tokens=False)
+        # split all cell values into tokens and see how many can be accommodated
+        used_token_len = len(question_tokens) + len(header_tokens)
+        # remaining token space for rows
+        remain_token_len = max_length - used_token_len
+
+        value_string = ""
+        for _, row_example in enumerate(table_content["rows"]):
+            # use a general index to roughly estimate the overall token len
+            value_string += self.table_linearize.process_row(row_example, 100) + " "
+        value_token_len = len(self.tokenize(value_string))
+
+        if value_token_len < remain_token_len:
+            # no row will be deleted
+            return 0.0, remain_token_len
+        else:
+            # calc a roughly delete rate
+            return 1.0 - remain_token_len / value_token_len, remain_token_len
+
+    def delete_unrelated_rows(self, table_content: dict, question: str, answer: list, delete_ratio: float):
+        """
+        The argument answer is used only during training.
+        """
+        truncated_unrelated_indices = []
+        related_indices = []
+        if answer is None or len(answer) == 0:
+            answer_set = set()
+        else:
+            answer_set = {ans_ex.lower() for ans_ex in answer}
+        # add question key words into answer set
+        if question is not None:
+            answer_set.update(question.split())
+        question_set = set(question.strip("?!.,").split(" "))
+        row_max_len = len(table_content["rows"])
+        for _row_idx, row in enumerate(table_content["rows"]):
+            lower_row = {str(cell).lower() for cell in row}
+            if len(lower_row & answer_set) == 0 and len(lower_row & question_set) == 0:
+                truncated_unrelated_indices.append(_row_idx)
+            else:
+                # add neighbours to preserve information aggressively
+                related_indices.extend([_row_idx - 2, _row_idx - 1, _row_idx, _row_idx + 1, _row_idx + 2])
+
+        # remove the neighbours
+        truncated_unrelated_indices = [
+            _row_idx for _row_idx in truncated_unrelated_indices if _row_idx not in related_indices
+        ]
+        # select some cases to drop
+        drop_items = min(len(truncated_unrelated_indices), int(len(table_content["rows"]) * delete_ratio))
+        drop_row_indices = random.choices(truncated_unrelated_indices, k=drop_items)
+
+        for _row_idx in reversed(range(row_max_len)):
+            if _row_idx in drop_row_indices:
+                del table_content["rows"][_row_idx]
+
+        # only when the drop ratio is too large, logging for warning.
+        if "id" in table_content and len(drop_row_indices) > 0:
+            logger.warning("Delete {:.2f} rows in table {}".format(len(drop_row_indices), table_content["id"]))
+
+
+__all__ = ["TapexTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4bccdd12c9703313951e223d0ac1955f9ca1581
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_trajectory_transformer import *
+    from .modeling_trajectory_transformer import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..5a267cc4c4b82c6703eaa3a732d3fca1606a60d7
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/configuration_trajectory_transformer.py
@@ -0,0 +1,155 @@
+# coding=utf-8
+# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TrajectoryTransformer model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TrajectoryTransformerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`TrajectoryTransformerModel`]. It is used to
+    instantiate an TrajectoryTransformer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the
+    TrajectoryTransformer
+    [CarlCochet/trajectory-transformer-halfcheetah-medium-v2](https://huggingface.co/CarlCochet/trajectory-transformer-halfcheetah-medium-v2)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 100):
+            Vocabulary size of the TrajectoryTransformer model. Defines the number of different tokens that can be
+            represented by the `trajectories` passed when calling [`TrajectoryTransformerModel`]
+        action_weight (`int`, *optional*, defaults to 5):
+            Weight of the action in the loss function
+        reward_weight (`int`, *optional*, defaults to 1):
+            Weight of the reward in the loss function
+        value_weight (`int`, *optional*, defaults to 1):
+            Weight of the value in the loss function
+        block_size (`int`, *optional*, defaults to 249):
+            Size of the blocks in the trajectory transformer.
+        action_dim (`int`, *optional*, defaults to 6):
+            Dimension of the action space.
+        observation_dim (`int`, *optional*, defaults to 17):
+            Dimension of the observation space.
+        transition_dim (`int`, *optional*, defaults to 25):
+            Dimension of the transition space.
+        n_layer (`int`, *optional*, defaults to 4):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 4):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_embd (`int`, *optional*, defaults to 128):
+            Dimensionality of the embeddings and hidden states.
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        kaiming_initializer_range (`float, *optional*, defaults to 1):
+            A coefficient scaling the negative slope of the kaiming initializer rectifier for EinLinear layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        Example:
+
+    ```python
+    >>> from transformers import TrajectoryTransformerConfig, TrajectoryTransformerModel
+
+    >>> # Initializing a TrajectoryTransformer CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration
+    >>> configuration = TrajectoryTransformerConfig()
+
+    >>> # Initializing a model (with random weights) from the CarlCochet/trajectory-transformer-halfcheetah-medium-v2 style configuration
+    >>> model = TrajectoryTransformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "trajectory_transformer"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=100,
+        action_weight=5,
+        reward_weight=1,
+        value_weight=1,
+        block_size=249,
+        action_dim=6,
+        observation_dim=17,
+        transition_dim=25,
+        n_layer=4,
+        n_head=4,
+        n_embd=128,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        resid_pdrop=0.1,
+        learning_rate=0.0006,
+        max_position_embeddings=512,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        kaiming_initializer_range=1,
+        use_cache=True,
+        pad_token_id=1,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.action_weight = action_weight
+        self.reward_weight = reward_weight
+        self.value_weight = value_weight
+        self.max_position_embeddings = max_position_embeddings
+        self.block_size = block_size
+        self.action_dim = action_dim
+        self.observation_dim = observation_dim
+        self.transition_dim = transition_dim
+        self.learning_rate = learning_rate
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_embd = n_embd
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.resid_pdrop = resid_pdrop
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.kaiming_initializer_range = kaiming_initializer_range
+        self.use_cache = use_cache
+        super().__init__(pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id, **kwargs)
+
+
+__all__ = ["TrajectoryTransformerConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6ae410c1474140c018c88bb5cbdaf8ebf076226
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/trajectory_transformer/modeling_trajectory_transformer.py
@@ -0,0 +1,602 @@
+# coding=utf-8
+# Copyright 2022 The Trajectory Transformers paper authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch TrajectoryTransformer model."""
+
+import math
+import os
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import functional as F
+
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_trajectory_transformer import TrajectoryTransformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
+_CONFIG_FOR_DOC = "TrajectoryTransformerConfig"
+
+
+def load_tf_weights_in_trajectory_transformer(model, config, tf_checkpoint_path):
+    """Load tf checkpoints in a pytorch model."""
+    try:
+        import re
+
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(tf_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array)
+
+    for name, array in zip(names, arrays):
+        name = name.split("/")
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ):
+            logger.info(f"Skipping {'/'.join(name)}")
+            continue
+        pointer = model
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+_\d+", m_name):
+                scope_names = re.split(r"_(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+            if scope_names[0] == "kernel" or scope_names[0] == "gamma":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "output_bias" or scope_names[0] == "beta":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "output_weights":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "squad":
+                pointer = getattr(pointer, "classifier")
+            else:
+                try:
+                    pointer = getattr(pointer, scope_names[0])
+                except AttributeError:
+                    logger.info(f"Skipping {'/'.join(name)}")
+                    continue
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+        if m_name[-11:] == "_embeddings":
+            pointer = getattr(pointer, "weight")
+        elif m_name == "kernel":
+            array = np.transpose(array)
+        try:
+            if pointer.shape != array.shape:
+                raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+        except AssertionError as e:
+            e.args += (pointer.shape, array.shape)
+            raise
+        logger.info(f"Initialize PyTorch weight {name}")
+        pointer.data = torch.from_numpy(array)
+    return model
+
+
+@dataclass
+class TrajectoryTransformerOutput(ModelOutput):
+    """
+    Base class for model's outputs that also contains a pooling of the last hidden states.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+        past_key_values (`tuple[tuple[torch.Tensor]]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            Tuple of length `config.n_layers`, containing tuples of tensors of shape `(batch_size, num_heads,
+            sequence_length, embed_size_per_head)`). Contains pre-computed hidden-states (key and values in the
+            attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. GPT2Attentions weights after the attention softmax, used to compute the weighted average
+            in the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class TrajectoryTransformerPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: TrajectoryTransformerConfig
+    load_tf_weights = load_tf_weights_in_trajectory_transformer
+    base_model_prefix = "trajectory_transformer"
+    main_input_name = "trajectories"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        if isinstance(module, (nn.Linear, nn.Embedding)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, EinLinear):
+            for i in range(module.n_models):
+                nn.init.kaiming_uniform_(module.weight[i], a=math.sqrt(5) / self.config.kaiming_initializer_range)
+                if module.bias is not None:
+                    fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight[i])
+                    bound = (1 / math.sqrt(fan_in)) * self.config.initializer_range
+                    nn.init.uniform_(module.bias[i], -bound, bound)
+
+
+TRAJECTORY_TRANSFORMER_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`TrajectoryTransformerConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING = r"""
+    Args:
+        trajectories (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Batch of trajectories, where a trajectory is a sequence of states, actions and rewards.
+        past_key_values (`tuple[tuple[torch.Tensor]]` of length `config.n_layers`, *optional*):
+            Contains precomputed hidden-states (key and values in the attention blocks) as computed by the model (see
+            `past_key_values` output below). Can be used to speed up sequential decoding. The `input_ids` which have
+            their past given to this model should not be passed as `input_ids` as they have already been computed.
+        targets (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Desired targets used to compute the loss.
+        attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+class EinLinear(nn.Module):
+    def __init__(self, n_models, in_features, out_features, bias):
+        super().__init__()
+        self.n_models = n_models
+        self.out_features = out_features
+        self.in_features = in_features
+        self.weight = nn.Parameter(torch.Tensor(n_models, out_features, in_features))
+        if bias:
+            self.bias = nn.Parameter(torch.Tensor(n_models, out_features))
+        else:
+            self.register_parameter("bias", None)
+
+    def reset_parameters(self):
+        for i in range(self.n_models):
+            nn.init.kaiming_uniform_(self.weight[i], a=math.sqrt(5))
+            if self.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight[i])
+                bound = 1 / math.sqrt(fan_in)
+                nn.init.uniform_(self.bias[i], -bound, bound)
+
+    def forward(self, input):
+        """
+        Args:
+            input (`torch.FloatTensor` of shape `(B, n_models, input_dim)`):
+                The input to the layer.
+        """
+        # [ batch_size x n_models x output_dim ]
+        output = torch.einsum("eoi,bei->beo", self.weight, input)
+        if self.bias is not None:
+            raise RuntimeError()
+        return output
+
+
+class CausalSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        if config.n_embd % config.n_head != 0:
+            raise ValueError(f"n_head ({config.n_head}) should be a divisor of n_embd ({config.n_embd})")
+
+        # key, query, value projections for all heads
+        self.key = nn.Linear(config.n_embd, config.n_embd)
+        self.query = nn.Linear(config.n_embd, config.n_embd)
+        self.value = nn.Linear(config.n_embd, config.n_embd)
+
+        # regularization
+        self.attn_drop = nn.Dropout(config.attn_pdrop)
+        self.resid_drop = nn.Dropout(config.resid_pdrop)
+
+        # output projection
+        self.proj = nn.Linear(config.n_embd, config.n_embd)
+
+        # causal mask to ensure that attention is only applied to the left in the input sequence
+        self.register_buffer(
+            "mask",
+            torch.tril(torch.ones(config.block_size, config.block_size)).view(
+                1, 1, config.block_size, config.block_size
+            ),
+            persistent=False,
+        )
+
+        # mask previous value estimates
+        joined_dim = config.observation_dim + config.action_dim + 2
+        self.mask.squeeze()[:, joined_dim - 1 :: joined_dim] = 0
+
+        self.n_head = config.n_head
+
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        layer_past: Optional[tuple[torch.Tensor]] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+    ):
+        batch_size, sequence_length, embedding_dim = hidden_states.size()
+
+        # calculate query, key, values for all heads in batch and move head forward to be the batch dim
+        # [ batch_size x n_heads x sequence_length x head_dim ]
+        key = (
+            self.key(hidden_states)
+            .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
+            .transpose(1, 2)
+        )
+        query = (
+            self.query(hidden_states)
+            .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
+            .transpose(1, 2)
+        )
+        value = (
+            self.value(hidden_states)
+            .view(batch_size, sequence_length, self.n_head, embedding_dim // self.n_head)
+            .transpose(1, 2)
+        )
+
+        if layer_past is not None:
+            past_key, past_value = layer_past
+            key = torch.cat((past_key, key), dim=-2)
+            value = torch.cat((past_value, value), dim=-2)
+
+        if use_cache is True:
+            present = (key, value)
+        else:
+            present = None
+
+        # causal self-attention
+        # [ batch_size x n_heads x sequence_length x sequence_length ]
+        attn_weights = (torch.matmul(query, key.transpose(-2, -1))) * (1.0 / math.sqrt(key.size(-1)))
+        attn_weights = attn_weights.masked_fill(
+            self.mask[:, :, :sequence_length, :sequence_length] == 0, torch.finfo(attn_weights.dtype).min
+        )
+        attn_weights = F.softmax(attn_weights, dim=-1)
+        self._attn_map = attn_weights.clone()
+        attn_weights = self.attn_drop(attn_weights)
+
+        output = torch.matmul(attn_weights, value)
+        # [ batch_size x sequence_length x embedding_dim ]
+        # re-assemble all head outputs side by side
+        output = output.transpose(1, 2).contiguous().view(batch_size, sequence_length, embedding_dim)
+
+        # output projection
+        output = self.resid_drop(self.proj(output))
+
+        outputs = (output, present)
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class Block(GradientCheckpointingLayer):
+    def __init__(self, config):
+        super().__init__()
+        self.ln1 = nn.LayerNorm(config.n_embd)
+        self.ln2 = nn.LayerNorm(config.n_embd)
+        self.attn = CausalSelfAttention(config)
+
+        # MLP
+        self.l1 = nn.Linear(config.n_embd, 4 * config.n_embd)
+        self.act = nn.GELU()
+        self.l2 = nn.Linear(4 * config.n_embd, config.n_embd)
+        self.drop = nn.Dropout(config.resid_pdrop)
+
+    def forward(
+        self,
+        hidden_states: Optional[tuple[torch.FloatTensor]],
+        layer_past: Optional[tuple[torch.Tensor]] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+    ):
+        residual = hidden_states
+        hidden_states = self.ln1(hidden_states)
+
+        attn_outputs = self.attn(
+            hidden_states, layer_past=layer_past, use_cache=use_cache, output_attentions=output_attentions
+        )
+        attn_output = attn_outputs[0]
+        outputs = attn_outputs[1:]
+        hidden_states = attn_output + residual
+
+        residual = hidden_states
+        hidden_states = self.ln2(hidden_states)
+        hidden_states = self.l1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.l2(hidden_states)
+        hidden_states = residual + self.drop(hidden_states)
+
+        if use_cache:
+            outputs = (hidden_states,) + outputs
+        else:
+            outputs = (hidden_states,) + outputs[1:]
+
+        return outputs
+
+
+@add_start_docstrings(
+    "The bare TrajectoryTransformer Model transformer outputting raw hidden-states without any specific head on top.",
+    TRAJECTORY_TRANSFORMER_START_DOCSTRING,
+)
+class TrajectoryTransformerModel(TrajectoryTransformerPreTrainedModel):
+    """the full GPT language model, with a context size of block_size"""
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        # input embedding stem (+1 for stop token)
+        self.tok_emb = nn.Embedding(config.vocab_size * config.transition_dim + 1, config.n_embd)
+
+        self.pos_emb = nn.Parameter(torch.zeros(1, config.block_size, config.n_embd))
+        self.drop = nn.Dropout(config.embd_pdrop)
+        # transformer
+        self.blocks = nn.ModuleList([Block(config) for _ in range(config.n_layer)])
+        # decoder head
+        self.ln_f = nn.LayerNorm(config.n_embd)
+        self.head = EinLinear(config.transition_dim, config.n_embd, config.vocab_size + 1, bias=False)
+
+        self.vocab_size = config.vocab_size
+        self.stop_token = config.vocab_size * config.transition_dim
+        self.block_size = config.block_size
+
+        self.observation_dim = config.observation_dim
+        self.action_dim = config.action_dim
+        self.transition_dim = config.transition_dim
+        self.embedding_dim = config.n_embd
+
+        self.action_weight = config.action_weight
+        self.reward_weight = config.reward_weight
+        self.value_weight = config.value_weight
+
+        self.gradient_checkpointing = False
+
+        self.post_init()
+
+    def get_block_size(self):
+        return self.block_size
+
+    def offset_tokens(self, trajectories):
+        _, sequence_length = trajectories.shape
+
+        n_states = int(np.ceil(sequence_length / self.transition_dim))
+
+        offsets = torch.arange(self.transition_dim) * self.vocab_size
+        offsets = offsets.repeat(n_states).to(trajectories.device)
+
+        offset_trajectories = trajectories + offsets[:sequence_length]
+        offset_trajectories[trajectories == self.vocab_size] = self.stop_token
+        return offset_trajectories
+
+    def pad_to_full_observation(self, hidden_states):
+        batch_size, sequence_length, _ = hidden_states.shape
+
+        n_pad = (self.transition_dim - sequence_length % self.transition_dim) % self.transition_dim
+        padding = torch.zeros(batch_size, n_pad, self.embedding_dim, device=hidden_states.device)
+
+        # [ batch_size x padded_sequence_length' x embedding_dim ]
+        hidden_states_pad = torch.cat([hidden_states, padding], dim=1)
+        hidden_states_pad = hidden_states_pad.view(-1, self.transition_dim, self.embedding_dim)
+
+        return hidden_states_pad, n_pad
+
+    @add_start_docstrings_to_model_forward(
+        TRAJECTORY_TRANSFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length")
+    )
+    @replace_return_docstrings(output_type=TrajectoryTransformerOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        trajectories: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        targets: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], TrajectoryTransformerOutput]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TrajectoryTransformerModel
+        >>> import torch
+
+        >>> model = TrajectoryTransformerModel.from_pretrained(
+        ...     "CarlCochet/trajectory-transformer-halfcheetah-medium-v2"
+        ... )
+        >>> model.to(device)
+        >>> model.eval()
+
+        >>> observations_dim, action_dim, batch_size = 17, 6, 256
+        >>> seq_length = observations_dim + action_dim + 1
+
+        >>> trajectories = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(batch_size)]).to(
+        ...     device
+        ... )
+        >>> targets = torch.LongTensor([np.random.permutation(self.seq_length) for _ in range(batch_size)]).to(device)
+
+        >>> outputs = model(
+        ...     trajectories,
+        ...     targets=targets,
+        ...     use_cache=True,
+        ...     output_attentions=True,
+        ...     output_hidden_states=True,
+        ...     return_dict=True,
+        ... )
+        ```
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        if past_key_values is None:
+            past_key_values = tuple([None] * len(self.blocks))
+
+        batch_size, sequence_length = trajectories.size()
+
+        if sequence_length > self.block_size:
+            raise ValueError("Cannot forward, model block size is exhausted.")
+
+        offset_trajectories = self.offset_tokens(trajectories)
+        # [ batch_size x sequence_length x embedding_dim ]
+        # forward the GPT model
+        token_embeddings = self.tok_emb(offset_trajectories)  # each index maps to a (learnable) vector
+        position_embeddings = self.pos_emb[:, :sequence_length, :]  # each position maps to a (learnable) vector
+
+        hidden_states = self.drop(token_embeddings + position_embeddings)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        presents = () if use_cache else None
+        all_self_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for i, (block, layer_past) in enumerate(zip(self.blocks, past_key_values)):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(hidden_states, layer_past, use_cache, output_attentions)
+
+            hidden_states = outputs[0]
+            if use_cache is True:
+                presents = presents + (outputs[1],)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[2 if use_cache else 1],)
+
+        # [ batch_size x sequence_length x embedding_dim ]
+        hidden_state = self.ln_f(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        hidden_states_pad, n_pad = self.pad_to_full_observation(hidden_state)
+
+        logits = self.head(hidden_states_pad)
+        logits = logits.reshape(batch_size, sequence_length + n_pad, self.vocab_size + 1)
+        logits = logits[:, :sequence_length]
+
+        # if we are given some desired targets also calculate the loss
+        if targets is not None:
+            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.view(-1), reduction="none")
+            if self.action_weight != 1 or self.reward_weight != 1 or self.value_weight != 1:
+                # make weights
+                n_states = int(np.ceil(sequence_length / self.transition_dim))
+                weights = torch.cat(
+                    [
+                        torch.ones(self.observation_dim, device=trajectories.device),
+                        torch.ones(self.action_dim, device=trajectories.device) * self.action_weight,
+                        torch.ones(1, device=trajectories.device) * self.reward_weight,
+                        torch.ones(1, device=trajectories.device) * self.value_weight,
+                    ]
+                )
+                weights = weights.repeat(n_states)
+                weights = weights[1:].repeat(batch_size, 1)
+                loss = loss * weights.view(-1)
+            loss = (loss * attention_mask.view(-1)).mean()
+        else:
+            loss = None
+
+        if not return_dict:
+            return tuple(v for v in [loss, logits, presents, all_hidden_states, all_self_attentions] if v is not None)
+
+        return TrajectoryTransformerOutput(
+            loss=loss,
+            logits=logits,
+            past_key_values=presents,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+__all__ = [
+    "TrajectoryTransformerModel",
+    "TrajectoryTransformerPreTrainedModel",
+    "load_tf_weights_in_trajectory_transformer",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ac9a2cbf4766bd187d90fdaf46505db3e92b68f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_transfo_xl import *
+    from .modeling_tf_transfo_xl import *
+    from .modeling_transfo_xl import *
+    from .tokenization_transfo_xl import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7926246c4cbb37904fb3044507f4831bb7d9bb9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/configuration_transfo_xl.py
@@ -0,0 +1,189 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Transformer XL configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TransfoXLConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`TransfoXLModel`] or a [`TFTransfoXLModel`]. It is
+    used to instantiate a Transformer-XL model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the TransfoXL
+    [transfo-xl/transfo-xl-wt103](https://huggingface.co/transfo-xl/transfo-xl-wt103) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 267735):
+            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`TransfoXLModel`] or [`TFTransfoXLModel`].
+        cutoffs (`list[int]`, *optional*, defaults to `[20000, 40000, 200000]`):
+            Cutoffs for the adaptive softmax.
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the model's hidden states.
+        d_embed (`int`, *optional*, defaults to 1024):
+            Dimensionality of the embeddings
+        n_head (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        d_head (`int`, *optional*, defaults to 64):
+            Dimensionality of the model's heads.
+        d_inner (`int`, *optional*, defaults to 4096):
+            Inner dimension in FF
+        div_val (`int`, *optional*, defaults to 4):
+            Divident value for adaptive input and softmax
+        pre_lnorm (`boolean`, *optional*, defaults to `False`):
+            Whether or not to apply LayerNorm to the input instead of the output in the blocks.
+        n_layer (`int`, *optional*, defaults to 18):
+            Number of hidden layers in the Transformer encoder.
+        mem_len (`int`, *optional*, defaults to 1600):
+            Length of the retained previous heads.
+        clamp_len (`int`, *optional*, defaults to 1000):
+            Use the same pos embeddings after clamp_len.
+        same_length (`boolean`, *optional*, defaults to `True`):
+            Whether or not to use the same attn length for all tokens
+        proj_share_all_but_first (`boolean`, *optional*, defaults to `True`):
+            True to share all but first projs, False not to share.
+        attn_type (`int`, *optional*, defaults to 0):
+            Attention type. 0 for Transformer-XL, 1 for Shaw et al, 2 for Vaswani et al, 3 for Al Rfou et al.
+        sample_softmax (`int`, *optional*, defaults to -1):
+            Number of samples in the sampled softmax.
+        adaptive (`boolean`, *optional*, defaults to `True`):
+            Whether or not to use adaptive softmax.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        dropatt (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        untie_r (`boolean`, *optional*, defaults to `True`):
+            Whether ot not to untie relative position biases.
+        init (`str`, *optional*, defaults to `"normal"`):
+            Parameter initializer to use.
+        init_range (`float`, *optional*, defaults to 0.01):
+            Parameters initialized by U(-init_range, init_range).
+        proj_init_std (`float`, *optional*, defaults to 0.01):
+            Parameters initialized by N(0, init_std)
+        init_std (`float`, *optional*, defaults to 0.02):
+            Parameters initialized by N(0, init_std)
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            The epsilon to use in the layer normalization layers
+        eos_token_id (`int`, *optional*, defaults to 0):
+            End of stream token id.
+
+    Examples:
+
+    ```python
+    >>> from transformers import TransfoXLConfig, TransfoXLModel
+
+    >>> # Initializing a Transformer XL configuration
+    >>> configuration = TransfoXLConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = TransfoXLModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "transfo-xl"
+    keys_to_ignore_at_inference = ["mems"]
+    attribute_map = {
+        "n_token": "vocab_size",
+        "hidden_size": "d_model",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=267735,
+        cutoffs=[20000, 40000, 200000],
+        d_model=1024,
+        d_embed=1024,
+        n_head=16,
+        d_head=64,
+        d_inner=4096,
+        div_val=4,
+        pre_lnorm=False,
+        n_layer=18,
+        mem_len=1600,
+        clamp_len=1000,
+        same_length=True,
+        proj_share_all_but_first=True,
+        attn_type=0,
+        sample_softmax=-1,
+        adaptive=True,
+        dropout=0.1,
+        dropatt=0.0,
+        untie_r=True,
+        init="normal",
+        init_range=0.01,
+        proj_init_std=0.01,
+        init_std=0.02,
+        layer_norm_epsilon=1e-5,
+        eos_token_id=0,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.cutoffs = []
+        self.cutoffs.extend(cutoffs)
+        if proj_share_all_but_first:
+            self.tie_projs = [False] + [True] * len(self.cutoffs)
+        else:
+            self.tie_projs = [False] + [False] * len(self.cutoffs)
+        self.d_model = d_model
+        self.d_embed = d_embed
+        self.d_head = d_head
+        self.d_inner = d_inner
+        self.div_val = div_val
+        self.pre_lnorm = pre_lnorm
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.mem_len = mem_len
+        self.same_length = same_length
+        self.attn_type = attn_type
+        self.clamp_len = clamp_len
+        self.sample_softmax = sample_softmax
+        self.adaptive = adaptive
+        self.dropout = dropout
+        self.dropatt = dropatt
+        self.untie_r = untie_r
+        self.init = init
+        self.init_range = init_range
+        self.proj_init_std = proj_init_std
+        self.init_std = init_std
+        self.layer_norm_epsilon = layer_norm_epsilon
+        super().__init__(eos_token_id=eos_token_id, **kwargs)
+
+    @property
+    def max_position_embeddings(self):
+        # Message copied from Transformer-XL documentation
+        logger.info(f"The model {self.model_type} is one of the few models that has no sequence length limit.")
+        return -1
+
+    @max_position_embeddings.setter
+    def max_position_embeddings(self, value):
+        # Message copied from Transformer-XL documentation
+        raise NotImplementedError(
+            f"The model {self.model_type} is one of the few models that has no sequence length limit."
+        )
+
+
+__all__ = ["TransfoXLConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c7830d633448bde3b8a420f0d84cdd070498882
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl.py
@@ -0,0 +1,1128 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+TF 2.0 Transformer XL model.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+import numpy as np
+import tensorflow as tf
+
+from ....modeling_tf_utils import (
+    TFModelInputType,
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    get_initializer,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ....tf_utils import shape_list, stable_softmax
+from ....utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_transfo_xl import TransfoXLConfig
+from .modeling_tf_transfo_xl_utilities import TFAdaptiveSoftmaxMask
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
+_CONFIG_FOR_DOC = "TransfoXLConfig"
+
+
+class TFPositionalEmbedding(keras.layers.Layer):
+    def __init__(self, demb, **kwargs):
+        super().__init__(**kwargs)
+
+        self.inv_freq = 1 / (10000 ** (tf.range(0, demb, 2.0) / demb))
+
+    def call(self, pos_seq, bsz=None):
+        self.inv_freq = tf.cast(self.inv_freq, dtype=pos_seq.dtype)
+        sinusoid_inp = tf.einsum("i,j->ij", pos_seq, self.inv_freq)
+        pos_emb = tf.concat([tf.sin(sinusoid_inp), tf.cos(sinusoid_inp)], -1)
+
+        if bsz is not None:
+            return tf.tile(pos_emb[:, None, :], [1, bsz, 1])
+        else:
+            return pos_emb[:, None, :]
+
+
+class TFPositionwiseFF(keras.layers.Layer):
+    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5, init_std=0.02, **kwargs):
+        super().__init__(**kwargs)
+
+        self.d_model = d_model
+        self.d_inner = d_inner
+        self.dropout = dropout
+
+        self.layer_1 = keras.layers.Dense(
+            d_inner, kernel_initializer=get_initializer(init_std), activation=tf.nn.relu, name="CoreNet_._0"
+        )
+        self.drop_1 = keras.layers.Dropout(dropout)
+        self.layer_2 = keras.layers.Dense(d_model, kernel_initializer=get_initializer(init_std), name="CoreNet_._3")
+        self.drop_2 = keras.layers.Dropout(dropout)
+
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
+
+        self.pre_lnorm = pre_lnorm
+
+    def call(self, inp, training=False):
+        if self.pre_lnorm:
+            # layer normalization + positionwise feed-forward
+            core_out = self.layer_norm(inp)
+            core_out = self.layer_1(core_out)
+            core_out = self.drop_1(core_out, training=training)
+            core_out = self.layer_2(core_out)
+            core_out = self.drop_2(core_out, training=training)
+
+            # residual connection
+            output = core_out + inp
+        else:
+            # positionwise feed-forward
+            core_out = self.layer_1(inp)
+            core_out = self.drop_1(core_out, training=training)
+            core_out = self.layer_2(core_out)
+            core_out = self.drop_2(core_out, training=training)
+
+            # residual connection + layer normalization
+            output = self.layer_norm(inp + core_out)
+
+        return output
+
+
+class TFRelPartialLearnableMultiHeadAttn(keras.layers.Layer):
+    def __init__(
+        self,
+        n_head,
+        d_model,
+        d_head,
+        dropout,
+        dropatt=0.0,
+        pre_lnorm=False,
+        r_r_bias=None,
+        r_w_bias=None,
+        layer_norm_epsilon=1e-5,
+        init_std=0.02,
+        output_attentions=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.n_head = n_head
+        self.d_model = d_model
+        self.d_head = d_head
+        self.dropout = dropout
+        self.output_attentions = output_attentions
+
+        self.qkv_net = keras.layers.Dense(
+            3 * n_head * d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="qkv_net"
+        )
+
+        self.drop = keras.layers.Dropout(dropout)
+        self.dropatt = keras.layers.Dropout(dropatt)
+        self.o_net = keras.layers.Dense(
+            d_model, kernel_initializer=get_initializer(init_std), use_bias=False, name="o_net"
+        )
+
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=layer_norm_epsilon, name="layer_norm")
+
+        self.scale = 1 / (d_head**0.5)
+
+        self.pre_lnorm = pre_lnorm
+
+        if r_r_bias is not None and r_w_bias is not None:  # Biases are shared
+            self.r_r_bias = r_r_bias
+            self.r_w_bias = r_w_bias
+        else:
+            self.r_r_bias = None
+            self.r_w_bias = None
+
+        self.r_net = keras.layers.Dense(
+            self.n_head * self.d_head, kernel_initializer=get_initializer(init_std), use_bias=False, name="r_net"
+        )
+
+    def build(self, input_shape):
+        if self.r_r_bias is None or self.r_w_bias is None:  # Biases are not shared
+            self.r_r_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
+            )
+            self.r_w_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
+            )
+        super().build(input_shape)
+
+    def _rel_shift(self, x):
+        x_size = shape_list(x)
+
+        x = tf.pad(x, [[0, 0], [1, 0], [0, 0], [0, 0]])
+        x = tf.reshape(x, [x_size[1] + 1, x_size[0], x_size[2], x_size[3]])
+        x = tf.slice(x, [1, 0, 0, 0], [-1, -1, -1, -1])
+        x = tf.reshape(x, x_size)
+
+        return x
+
+    def call(self, w, r, attn_mask, mems, head_mask, output_attentions, training=False):
+        qlen, rlen, bsz = shape_list(w)[0], shape_list(r)[0], shape_list(w)[1]
+
+        if mems is not None:
+            mems = tf.cast(mems, dtype=w.dtype)
+            cat = tf.concat([mems, w], 0)
+            if self.pre_lnorm:
+                w_heads = self.qkv_net(self.layer_norm(cat))
+            else:
+                w_heads = self.qkv_net(cat)
+            r_head_k = self.r_net(r)
+
+            w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
+            w_head_q = w_head_q[-qlen:]
+        else:
+            if self.pre_lnorm:
+                w_heads = self.qkv_net(self.layer_norm(w))
+            else:
+                w_heads = self.qkv_net(w)
+            r_head_k = self.r_net(r)
+
+            w_head_q, w_head_k, w_head_v = tf.split(w_heads, 3, axis=-1)
+
+        klen = shape_list(w_head_k)[0]
+
+        w_head_q = tf.reshape(w_head_q, (qlen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
+        w_head_k = tf.reshape(w_head_k, (klen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
+        w_head_v = tf.reshape(w_head_v, (klen, bsz, self.n_head, self.d_head))  # qlen x bsz x n_head x d_head
+
+        r_head_k = tf.reshape(r_head_k, (rlen, self.n_head, self.d_head))  # qlen x n_head x d_head
+
+        # compute attention score
+        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
+        AC = tf.einsum("ibnd,jbnd->ijbn", rw_head_q, w_head_k)  # qlen x klen x bsz x n_head
+
+        rr_head_q = w_head_q + self.r_r_bias
+        BD = tf.einsum("ibnd,jnd->ijbn", rr_head_q, r_head_k)  # qlen x klen x bsz x n_head
+        BD = self._rel_shift(BD)
+
+        # [qlen x klen x bsz x n_head]
+        attn_score = AC + BD
+        attn_score = attn_score * self.scale
+
+        # compute attention probability
+        if attn_mask is not None:
+            attn_mask_t = attn_mask[:, :, None, None]
+            attn_mask_t = tf.cast(attn_mask_t, dtype=attn_score.dtype)
+            attn_score = attn_score * (1.0 - attn_mask_t) - 1e30 * attn_mask_t
+
+        # [qlen x klen x bsz x n_head]
+        attn_prob = stable_softmax(attn_score, axis=1)
+        attn_prob = self.dropatt(attn_prob, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_prob = attn_prob * head_mask
+
+        # compute attention vector
+        attn_vec = tf.einsum("ijbn,jbnd->ibnd", attn_prob, w_head_v)
+
+        # [qlen x bsz x n_head x d_head]
+        attn_vec_sizes = shape_list(attn_vec)
+        attn_vec = tf.reshape(attn_vec, (attn_vec_sizes[0], attn_vec_sizes[1], self.n_head * self.d_head))
+
+        # linear projection
+        attn_out = self.o_net(attn_vec)
+        attn_out = self.drop(attn_out, training=training)
+
+        if self.pre_lnorm:
+            # residual connection
+            outputs = [w + attn_out]
+        else:
+            # residual connection + layer normalization
+            outputs = [self.layer_norm(w + attn_out)]
+
+        if output_attentions:
+            outputs.append(attn_prob)
+
+        return outputs
+
+
+class TFRelPartialLearnableDecoderLayer(keras.layers.Layer):
+    def __init__(
+        self,
+        n_head,
+        d_model,
+        d_head,
+        d_inner,
+        dropout,
+        dropatt=0.0,
+        pre_lnorm=False,
+        r_w_bias=None,
+        r_r_bias=None,
+        layer_norm_epsilon=1e-5,
+        init_std=0.02,
+        output_attentions=False,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.dec_attn = TFRelPartialLearnableMultiHeadAttn(
+            n_head,
+            d_model,
+            d_head,
+            dropout,
+            dropatt=dropatt,
+            pre_lnorm=pre_lnorm,
+            r_w_bias=r_w_bias,
+            r_r_bias=r_r_bias,
+            init_std=init_std,
+            layer_norm_epsilon=layer_norm_epsilon,
+            output_attentions=output_attentions,
+            name="dec_attn",
+        )
+        self.pos_ff = TFPositionwiseFF(
+            d_model,
+            d_inner,
+            dropout,
+            pre_lnorm=pre_lnorm,
+            init_std=init_std,
+            layer_norm_epsilon=layer_norm_epsilon,
+            name="pos_ff",
+        )
+
+    def call(self, dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=False):
+        attn_outputs = self.dec_attn(dec_inp, r, dec_attn_mask, mems, head_mask, output_attentions, training=training)
+        ff_output = self.pos_ff(attn_outputs[0], training=training)
+
+        outputs = [ff_output] + attn_outputs[1:]
+
+        return outputs
+
+
+class TFTransfoEmbeddings(keras.layers.Layer):
+    def __init__(self, vocab_size, emb_size, init_std, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.emb_size = emb_size
+        self.init_std = init_std
+
+    def build(self, input_shape):
+        self.weight = self.add_weight(
+            shape=(self.vocab_size, self.emb_size),
+            initializer=get_initializer(self.init_std),
+            name="embeddings",
+        )
+
+        super().build(input_shape)
+
+    def call(self, inputs):
+        return tf.gather(self.weight, inputs)
+
+
+class TFAdaptiveEmbedding(keras.layers.Layer):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, init_std=0.02, sample_softmax=False, **kwargs):
+        super().__init__(**kwargs)
+
+        self.n_token = n_token
+        self.d_embed = d_embed
+        self.init_std = init_std
+
+        self.cutoffs = cutoffs + [n_token]
+        self.div_val = div_val
+        self.d_proj = d_proj
+
+        self.emb_scale = d_proj**0.5
+
+        self.cutoff_ends = [0] + self.cutoffs
+
+        self.emb_layers = []
+        self.emb_projs = []
+
+        if div_val == 1:
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+        else:
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                d_emb_i = d_embed // (div_val**i)
+                self.emb_layers.append(
+                    TFTransfoEmbeddings(
+                        r_idx - l_idx,
+                        d_emb_i,
+                        init_std,
+                        name=f"emb_layers_._{i}",
+                    )
+                )
+
+    def build(self, input_shape):
+        for i in range(len(self.cutoffs)):
+            d_emb_i = self.d_embed // (self.div_val**i)
+            self.emb_projs.append(
+                self.add_weight(
+                    shape=(d_emb_i, self.d_proj),
+                    initializer=get_initializer(self.init_std),
+                    trainable=True,
+                    name=f"emb_projs_._{i}",
+                )
+            )
+
+        super().build(input_shape)
+
+    def call(self, inp):
+        if self.div_val == 1:
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+        else:
+            inp_flat = tf.reshape(inp, (-1,))
+            emb_flat = tf.zeros([shape_list(inp_flat)[0], self.d_proj])
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+
+                mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
+
+                inp_i = tf.boolean_mask(inp_flat, mask_i) - l_idx
+                emb_i = self.emb_layers[i](inp_i)
+                emb_i = tf.einsum("id,de->ie", emb_i, self.emb_projs[i])
+
+                mask_idx = tf.where(mask_i)
+                scatter = tf.scatter_nd(mask_idx, emb_i, shape_list(emb_flat))
+                emb_flat = tf.cast(emb_flat, dtype=scatter.dtype)
+                emb_flat += scatter
+
+            embed_shape = shape_list(inp) + [self.d_proj]
+            embed = tf.reshape(emb_flat, embed_shape)
+
+        embed *= self.emb_scale
+
+        return embed
+
+
+@keras_serializable
+class TFTransfoXLMainLayer(keras.layers.Layer):
+    config_class = TransfoXLConfig
+
+    def __init__(self, config, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.output_hidden_states = config.output_hidden_states
+        self.output_attentions = config.output_attentions
+        self.return_dict = config.use_return_dict
+
+        self.n_token = config.vocab_size
+
+        self.d_embed = config.d_embed
+        self.d_model = config.d_model
+        self.n_head = config.n_head
+        self.d_head = config.d_head
+        self.untie_r = config.untie_r
+
+        self.word_emb = TFAdaptiveEmbedding(
+            config.vocab_size,
+            config.d_embed,
+            config.d_model,
+            config.cutoffs,
+            div_val=config.div_val,
+            init_std=config.init_std,
+            name="word_emb",
+        )
+
+        self.drop = keras.layers.Dropout(config.dropout)
+
+        self.n_layer = config.n_layer
+        self.mem_len = config.mem_len
+        self.attn_type = config.attn_type
+
+        self.layers = []
+        if config.attn_type == 0:  # the default attention
+            for i in range(config.n_layer):
+                self.layers.append(
+                    TFRelPartialLearnableDecoderLayer(
+                        config.n_head,
+                        config.d_model,
+                        config.d_head,
+                        config.d_inner,
+                        config.dropout,
+                        dropatt=config.dropatt,
+                        pre_lnorm=config.pre_lnorm,
+                        r_w_bias=None if self.untie_r else self.r_w_bias,
+                        r_r_bias=None if self.untie_r else self.r_r_bias,
+                        layer_norm_epsilon=config.layer_norm_epsilon,
+                        init_std=config.init_std,
+                        output_attentions=self.output_attentions,
+                        name=f"layers_._{i}",
+                    )
+                )
+        else:  # learnable embeddings and absolute embeddings
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+        self.same_length = config.same_length
+        self.clamp_len = config.clamp_len
+
+        if self.attn_type == 0:  # default attention
+            self.pos_emb = TFPositionalEmbedding(self.d_model, name="pos_emb")
+        else:  # learnable embeddings and absolute embeddings
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+    def build(self, input_shape):
+        if not self.untie_r:
+            self.r_w_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_w_bias"
+            )
+            self.r_r_bias = self.add_weight(
+                shape=(self.n_head, self.d_head), initializer="zeros", trainable=True, name="r_r_bias"
+            )
+        super().build(input_shape)
+
+    def get_input_embeddings(self):
+        return self.word_emb
+
+    def set_input_embeddings(self, value):
+        raise NotImplementedError
+
+    def backward_compatible(self):
+        self.sample_softmax = -1
+
+    def reset_memory_length(self, mem_len):
+        self.mem_len = mem_len
+
+    def _prune_heads(self, heads):
+        raise NotImplementedError
+
+    def init_mems(self, bsz):
+        if self.mem_len > 0:
+            mems = []
+            for i in range(self.n_layer):
+                empty = tf.zeros([self.mem_len, bsz, self.d_model])
+                mems.append(empty)
+
+            return mems
+        else:
+            return None
+
+    def _update_mems(self, hids, mems, mlen, qlen):
+        # does not deal with None
+        if mems is None:
+            return None
+
+        # mems is not None
+        assert len(hids) == len(mems), "len(hids) != len(mems)"
+
+        # There are `mlen + qlen` steps that can be cached into mems
+        new_mems = []
+        end_idx = mlen + tf.math.maximum(0, qlen)
+        beg_idx = tf.math.maximum(0, end_idx - tf.convert_to_tensor(self.mem_len))
+        for i in range(len(hids)):
+            mems[i] = tf.cast(mems[i], dtype=hids[i].dtype)
+            cat = tf.concat([mems[i], hids[i]], axis=0)
+            tf.stop_gradient(cat)
+            new_mems.append(cat[beg_idx:end_idx])
+
+        return new_mems
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        mems: list[tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ):
+        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
+        # so we transpose here from shape [bsz, len] to shape [len, bsz]
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = tf.transpose(input_ids, perm=(1, 0))
+            qlen, bsz = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            inputs_embeds = tf.transpose(inputs_embeds, perm=(1, 0, 2))
+            qlen, bsz = shape_list(inputs_embeds)[:2]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if mems is None:
+            mems = self.init_mems(bsz)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
+        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
+        if head_mask is not None:
+            raise NotImplementedError
+        else:
+            head_mask = [None] * self.n_layer
+
+        if inputs_embeds is not None:
+            word_emb = inputs_embeds
+        else:
+            word_emb = self.word_emb(input_ids)
+
+        mlen = shape_list(mems[0])[0] if mems is not None else 0
+        klen = mlen + qlen
+
+        # Compute decoder attention mask
+        all_ones = tf.ones([qlen, klen], dtype=tf.int32)
+        upper_mask = 1 - tf.linalg.band_part(tf.ones([qlen, klen], dtype=tf.int32), -1, mlen)
+        if self.same_length:
+            mask_len = klen - self.mem_len
+            mask_shift_len = qlen - tf.nn.relu(mask_len)  # Lazy clamping of negatives to zero
+
+            # Use an indicator variable instead of a conditional to keep the compiler happy
+            lower_mask = tf.linalg.band_part(all_ones, -1, 0) - (
+                tf.linalg.band_part(all_ones, mask_shift_len - 1, 0) * tf.cast(mask_shift_len != 0, tf.int32)
+            )
+            dec_attn_mask = upper_mask + lower_mask
+        else:
+            dec_attn_mask = upper_mask
+
+        hids = []
+        attentions = [] if output_attentions else None
+        if self.attn_type == 0:  # default
+            pos_seq = tf.range(klen - 1, -1, -1.0)
+            if self.clamp_len > 0:
+                pos_seq = tf.minimum(pos_seq, self.clamp_len)
+            pos_emb = self.pos_emb(pos_seq)
+
+            core_out = self.drop(word_emb, training=training)
+            pos_emb = self.drop(pos_emb, training=training)
+
+            for i, layer in enumerate(self.layers):
+                hids.append(core_out)
+                mems_i = None if mems is None else mems[i]
+                layer_outputs = layer(
+                    core_out,
+                    pos_emb,
+                    dec_attn_mask,
+                    mems_i,
+                    head_mask[i],
+                    output_attentions,
+                    training=training,
+                )
+                core_out = layer_outputs[0]
+                if output_attentions:
+                    attentions.append(layer_outputs[1])
+        else:  # learnable embeddings and absolute embeddings
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+        core_out = self.drop(core_out, training=training)
+
+        new_mems = self._update_mems(hids, mems, mlen, qlen)
+
+        # We transpose back here to shape [bsz, len, hidden_dim]
+        core_out = tf.transpose(core_out, perm=(1, 0, 2))
+
+        if output_hidden_states:
+            # Transpose to library standard shape [bsz, len, hidden_dim] and add last layer
+            hids = tuple(tf.transpose(t, perm=(1, 0, 2)) for t in hids)
+            hids = hids + (core_out,)
+        else:
+            hids = None
+        if output_attentions:
+            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
+            attentions = tuple(tf.transpose(t, perm=(2, 3, 0, 1)) for t in attentions)
+
+        if not return_dict:
+            return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
+
+        return TFTransfoXLModelOutput(
+            last_hidden_state=core_out,
+            mems=new_mems,
+            hidden_states=hids,
+            attentions=attentions,
+        )
+
+
+class TFTransfoXLPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = TransfoXLConfig
+    base_model_prefix = "transformer"
+
+
+@dataclass
+class TFTransfoXLModelOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        last_hidden_state (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        mems (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+            input) to speed up sequential decoding. The token ids which have their past given to this model should not
+            be passed as input ids as they have already been computed.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    last_hidden_state: tf.Tensor | None = None
+    mems: list[tf.Tensor] = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+@dataclass
+class TFTransfoXLLMHeadModelOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        losses (`tf.Tensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided):
+            Language modeling losses (not reduced).
+        prediction_scores (`tf.Tensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
+        mems (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+            input) to speed up sequential decoding. The token ids which have their past given to this model should not
+            be passed as input ids as they have already been computed.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    prediction_scores: tf.Tensor | None = None
+    mems: list[tf.Tensor] = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+@dataclass
+class TFTransfoXLSequenceClassifierOutputWithPast(ModelOutput):
+    """
+    Base class for outputs of sentence classification models.
+
+    Args:
+        loss (`tf.Tensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Classification (or regression if config.num_labels==1) loss.
+        logits (`tf.Tensor` of shape `(batch_size, config.num_labels)`):
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        mems (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+            input) to speed up sequential decoding. The token ids which have their past given to this model should not
+            be passed as input ids as they have already been computed.
+        hidden_states (`tuple(tf.Tensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of shape
+            `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(tf.Tensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `tf.Tensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: tf.Tensor | None = None
+    logits: tf.Tensor | None = None
+    mems: list[tf.Tensor] = None
+    hidden_states: tuple[tf.Tensor] | None = None
+    attentions: tuple[tf.Tensor] | None = None
+
+
+TRANSFO_XL_START_DOCSTRING = r"""
+
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Parameters:
+        config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.__call__`] and
+            [`PreTrainedTokenizer.encode`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        mems (`list[tf.Tensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+            `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
+            given to this model should not be passed as `input_ids` as they have already been computed.
+        head_mask (`tf.Tensor` or `Numpy array` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        inputs_embeds (`tf.Tensor` or `Numpy array` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
+    TRANSFO_XL_START_DOCSTRING,
+)
+class TFTransfoXLModel(TFTransfoXLPreTrainedModel):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTransfoXLModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        mems: list[tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> TFTransfoXLModelOutput | tuple[tf.Tensor]:
+        outputs = self.transformer(
+            input_ids=input_ids,
+            mems=mems,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+
+@add_start_docstrings(
+    """
+    The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
+    input embeddings)
+    """,
+    TRANSFO_XL_START_DOCSTRING,
+)
+class TFTransfoXLLMHeadModel(TFTransfoXLPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
+        self.sample_softmax = config.sample_softmax
+        assert self.sample_softmax <= 0, (
+            "Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
+            " https://github.com/huggingface/transformers/issues/3310"
+        )
+
+        self.crit = TFAdaptiveSoftmaxMask(
+            config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val, name="crit"
+        )
+
+    def _resize_token_embeddings(self, new_num_tokens):
+        raise NotImplementedError()
+
+    def get_output_embeddings(self):
+        """Double-check if you are using adaptive softmax."""
+        if len(self.crit.out_layers) > 0:
+            return self.crit.out_layers[-1]
+        return None
+
+    def reset_memory_length(self, mem_len):
+        self.transformer.reset_memory_length(mem_len)
+
+    def init_mems(self, bsz):
+        return self.transformer.init_mems(bsz)
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTransfoXLLMHeadModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        mems: list[tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFTransfoXLLMHeadModelOutput | tuple[tf.Tensor]:
+        if input_ids is not None:
+            bsz, tgt_len = shape_list(input_ids)[:2]
+        else:
+            bsz, tgt_len = shape_list(inputs_embeds)[:2]
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            mems,
+            head_mask,
+            inputs_embeds,
+            output_attentions,
+            output_hidden_states,
+            return_dict,
+            training=training,
+        )
+
+        last_hidden = transformer_outputs[0]
+        pred_hid = last_hidden[:, -tgt_len:]
+
+        softmax_output = self.crit(pred_hid, labels, training=training)
+        prediction_scores = softmax_output if labels is None else ()
+
+        if not return_dict:
+            return (prediction_scores,) + transformer_outputs[1:]
+
+        return TFTransfoXLLMHeadModelOutput(
+            prediction_scores=prediction_scores,
+            mems=transformer_outputs.mems,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
+        inputs = {}
+
+        # if past is defined in model kwargs then use it for faster decoding
+        if past_key_values:
+            input_ids = tf.expand_dims(input_ids[:, -1], axis=-1)
+        else:
+            input_ids = input_ids
+
+        return inputs
+
+    # Adapted from the torch tie_weights function
+    def tf_to_pt_weight_rename(self, tf_weight):
+        if self.config.tie_word_embeddings and "crit.out_layers" in tf_weight:
+            return tf_weight, tf_weight.replace("crit.out_layers", "transformer.word_emb.emb_layers")
+        elif self.config.tie_projs and "crit.out_projs" in tf_weight:
+            for i, tie_proj in enumerate(self.config.tie_projs):
+                if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
+                    # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                    return tf_weight, tf_weight.replace(f"crit.out_projs.{i}", "transformer.word_emb.emb_projs.0")
+                elif tie_proj and self.config.div_val != 1:
+                    # self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+                    return tf_weight, tf_weight.replace("crit.out_projs", "transformer.word_emb.emb_projs")
+        else:
+            return (tf_weight,)
+
+
+@add_start_docstrings(
+    """
+    The Transfo XL Model transformer with a sequence classification head on top (linear layer).
+
+    [`TFTransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
+    models (e.g. GPT-1,GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    TRANSFO_XL_START_DOCSTRING,
+)
+class TFTransfoXLForSequenceClassification(TFTransfoXLPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+        self.score = keras.layers.Dense(
+            config.num_labels,
+            kernel_initializer=get_initializer(config.init_range),
+            name="score",
+            use_bias=False,
+        )
+        self.transformer = TFTransfoXLMainLayer(config, name="transformer")
+
+    def get_output_embeddings(self):
+        # Remove after transformers v4.32. Fix this model's `test_model_common_attributes` test too.
+        logger.warning(
+            "Sequence classification models do not have output embeddings. `.get_output_embeddings` will be removed "
+            "in transformers v4.32."
+        )
+        return self.transformer.word_emb
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFTransfoXLSequenceClassifierOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        mems: list[tf.Tensor] | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> tuple | TFTransfoXLSequenceClassifierOutputWithPast:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the cross entropy classification loss. Indices should be in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        transformer_outputs = self.transformer(
+            input_ids=input_ids,
+            mems=mems,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+        in_logits = None
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                sequence_lengths = (
+                    tf.argmax(tf.cast(tf.math.equal(input_ids, self.config.pad_token_id), input_ids.dtype), axis=-1)
+                    - 1
+                )
+                sequence_lengths = tf.where(sequence_lengths >= 0, sequence_lengths, input_ids.shape[-1] - 1)
+                in_logits = tf.gather(logits, sequence_lengths, batch_dims=1, axis=1)
+            else:
+                sequence_lengths = -1
+                logger.warning_once(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+        loss = None
+
+        if labels is not None:
+            if input_ids is not None:
+                batch_size, sequence_length = shape_list(input_ids)[:2]
+            else:
+                batch_size, sequence_length = shape_list(inputs_embeds)[:2]
+            assert self.config.pad_token_id is not None or batch_size == 1, (
+                "Cannot handle batch sizes > 1 if no padding token is defined."
+            )
+
+            if not tf.is_tensor(sequence_lengths):
+                in_logits = logits[0:batch_size, sequence_lengths]
+
+            loss = self.hf_compute_loss(tf.reshape(labels, [-1, 1]), tf.reshape(in_logits, [-1, self.num_labels]))
+
+        pooled_logits = in_logits if in_logits is not None else logits
+
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFTransfoXLSequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            mems=transformer_outputs.mems,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = [
+    "TFAdaptiveEmbedding",
+    "TFTransfoXLForSequenceClassification",
+    "TFTransfoXLLMHeadModel",
+    "TFTransfoXLMainLayer",
+    "TFTransfoXLModel",
+    "TFTransfoXLPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..48205e06fb20a473959544db4971dff0d3e58cbf
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_tf_transfo_xl_utilities.py
@@ -0,0 +1,178 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+A TF 2.0 Adaptive Softmax for Transformer XL model.
+"""
+
+import tensorflow as tf
+
+from ....modeling_tf_utils import keras
+from ....tf_utils import shape_list
+
+
+class TFAdaptiveSoftmaxMask(keras.layers.Layer):
+    def __init__(self, vocab_size, d_embed, d_proj, cutoffs, div_val=1, keep_order=False, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = vocab_size
+        self.d_embed = d_embed
+        self.d_proj = d_proj
+
+        self.cutoffs = cutoffs + [vocab_size]
+        self.cutoff_ends = [0] + self.cutoffs
+        self.div_val = div_val
+
+        self.shortlist_size = self.cutoffs[0]
+        self.n_clusters = len(self.cutoffs) - 1
+        self.head_size = self.shortlist_size + self.n_clusters
+        self.keep_order = keep_order
+
+        self.out_layers = []
+        self.out_projs = []
+
+    def build(self, input_shape):
+        if self.n_clusters > 0:
+            self.cluster_weight = self.add_weight(
+                shape=(self.n_clusters, self.d_embed), initializer="zeros", trainable=True, name="cluster_weight"
+            )
+            self.cluster_bias = self.add_weight(
+                shape=(self.n_clusters,), initializer="zeros", trainable=True, name="cluster_bias"
+            )
+
+        if self.div_val == 1:
+            for i in range(len(self.cutoffs)):
+                if self.d_proj != self.d_embed:
+                    weight = self.add_weight(
+                        shape=(self.d_embed, self.d_proj),
+                        initializer="zeros",
+                        trainable=True,
+                        name=f"out_projs_._{i}",
+                    )
+                    self.out_projs.append(weight)
+                else:
+                    self.out_projs.append(None)
+                weight = self.add_weight(
+                    shape=(self.vocab_size, self.d_embed),
+                    initializer="zeros",
+                    trainable=True,
+                    name=f"out_layers_._{i}_._weight",
+                )
+                bias = self.add_weight(
+                    shape=(self.vocab_size,),
+                    initializer="zeros",
+                    trainable=True,
+                    name=f"out_layers_._{i}_._bias",
+                )
+                self.out_layers.append((weight, bias))
+        else:
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                d_emb_i = self.d_embed // (self.div_val**i)
+
+                weight = self.add_weight(
+                    shape=(d_emb_i, self.d_proj), initializer="zeros", trainable=True, name=f"out_projs_._{i}"
+                )
+                self.out_projs.append(weight)
+                weight = self.add_weight(
+                    shape=(r_idx - l_idx, d_emb_i),
+                    initializer="zeros",
+                    trainable=True,
+                    name=f"out_layers_._{i}_._weight",
+                )
+                bias = self.add_weight(
+                    shape=(r_idx - l_idx,),
+                    initializer="zeros",
+                    trainable=True,
+                    name=f"out_layers_._{i}_._bias",
+                )
+                self.out_layers.append((weight, bias))
+        super().build(input_shape)
+
+    @staticmethod
+    def _logit(x, W, b, proj=None):
+        y = x
+        if proj is not None:
+            y = tf.einsum("ibd,ed->ibe", y, proj)
+        return tf.einsum("ibd,nd->ibn", y, W) + b
+
+    @staticmethod
+    def _gather_logprob(logprob, target):
+        lp_size = shape_list(logprob)
+        r = tf.range(lp_size[0], dtype=target.dtype)
+        idx = tf.stack([r, target], 1)
+        return tf.gather_nd(logprob, idx)
+
+    def call(self, hidden, target, return_mean=True, training=False):
+        head_logprob = 0
+        if self.n_clusters == 0:
+            output = self._logit(hidden, self.out_layers[0][0], self.out_layers[0][1], self.out_projs[0])
+            if target is not None:
+                loss = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=target, logits=output)
+            out = tf.nn.log_softmax(output, axis=-1)
+        else:
+            hidden_sizes = shape_list(hidden)
+            out = []
+            loss = tf.zeros(hidden_sizes[:2])
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                if target is not None:
+                    mask = (target >= l_idx) & (target < r_idx)
+                    mask_idx = tf.where(mask)
+                    cur_target = tf.boolean_mask(target, mask) - l_idx
+
+                if self.div_val == 1:
+                    cur_W = self.out_layers[0][0][l_idx:r_idx]
+                    cur_b = self.out_layers[0][1][l_idx:r_idx]
+                else:
+                    cur_W = self.out_layers[i][0]
+                    cur_b = self.out_layers[i][1]
+
+                if i == 0:
+                    cur_W = tf.concat([cur_W, self.cluster_weight], 0)
+                    cur_b = tf.concat([cur_b, self.cluster_bias], 0)
+
+                    head_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[0])
+                    head_logprob = tf.nn.log_softmax(head_logit)
+                    out.append(head_logprob[..., : self.cutoffs[0]])
+                    if target is not None:
+                        cur_head_logprob = tf.boolean_mask(head_logprob, mask)
+                        cur_logprob = self._gather_logprob(cur_head_logprob, cur_target)
+                else:
+                    tail_logit = self._logit(hidden, cur_W, cur_b, self.out_projs[i])
+                    tail_logprob = tf.nn.log_softmax(tail_logit)
+                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
+                    logprob_i = head_logprob[..., cluster_prob_idx, None] + tail_logprob
+                    out.append(logprob_i)
+                    if target is not None:
+                        cur_head_logprob = tf.boolean_mask(head_logprob, mask)
+                        cur_tail_logprob = tf.boolean_mask(tail_logprob, mask)
+                        cur_logprob = self._gather_logprob(cur_tail_logprob, cur_target)
+                        cur_logprob += cur_head_logprob[:, self.cutoff_ends[1] + i - 1]
+                if target is not None:
+                    loss += tf.scatter_nd(mask_idx, -cur_logprob, shape_list(loss))
+            out = tf.concat(out, axis=-1)
+
+        if target is not None:
+            if return_mean:
+                loss = tf.reduce_mean(loss)
+            # Add the training-time loss value to the layer using `self.add_loss()`.
+            self.add_loss(loss)
+
+            # Log the loss as a metric (we could log arbitrary metrics,
+            # including different metrics for training and inference.
+            self.add_metric(loss, name=self.name, aggregation="mean" if return_mean else "")
+
+        return out
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..49d07391320d0fbc93c969cc72eb27b3e5e4ed0a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl.py
@@ -0,0 +1,1303 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PyTorch Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl. In particular
+https://github.com/kimiyoung/transformer-xl/blob/master/pytorch/mem_transformer.py
+"""
+
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+    ModelOutput,
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+)
+from .configuration_transfo_xl import TransfoXLConfig
+from .modeling_transfo_xl_utilities import ProjectedAdaptiveLogSoftmax
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "transfo-xl/transfo-xl-wt103"
+_CONFIG_FOR_DOC = "TransfoXLConfig"
+
+
+def build_tf_to_pytorch_map(model, config):
+    """
+    A map of modules from TF to PyTorch. This time I use a map to keep the PyTorch model as identical to the original
+    PyTorch model as possible.
+    """
+    tf_to_pt_map = {}
+
+    if hasattr(model, "transformer"):
+        # We are loading in a TransfoXLLMHeadModel => we will load also the Adaptive Softmax
+        tf_to_pt_map.update(
+            {
+                "transformer/adaptive_softmax/cutoff_0/cluster_W": model.crit.cluster_weight,
+                "transformer/adaptive_softmax/cutoff_0/cluster_b": model.crit.cluster_bias,
+            }
+        )
+        for i, (out_l, proj_l, tie_proj) in enumerate(
+            zip(model.crit.out_layers, model.crit.out_projs, config.tie_projs)
+        ):
+            layer_str = f"transformer/adaptive_softmax/cutoff_{i}/"
+            if config.tie_word_embeddings:
+                tf_to_pt_map.update({layer_str + "b": out_l.bias})
+            else:
+                raise NotImplementedError
+                # I don't think this is implemented in the TF code
+                tf_to_pt_map.update({layer_str + "lookup_table": out_l.weight, layer_str + "b": out_l.bias})
+            if not tie_proj:
+                tf_to_pt_map.update({layer_str + "proj": proj_l})
+        # Now load the rest of the transformer
+        model = model.transformer
+
+    # Embeddings
+    for i, (embed_l, proj_l) in enumerate(zip(model.word_emb.emb_layers, model.word_emb.emb_projs)):
+        layer_str = f"transformer/adaptive_embed/cutoff_{i}/"
+        tf_to_pt_map.update({layer_str + "lookup_table": embed_l.weight, layer_str + "proj_W": proj_l})
+
+    # Transformer blocks
+    for i, b in enumerate(model.layers):
+        layer_str = f"transformer/layer_{i}/"
+        tf_to_pt_map.update(
+            {
+                layer_str + "rel_attn/LayerNorm/gamma": b.dec_attn.layer_norm.weight,
+                layer_str + "rel_attn/LayerNorm/beta": b.dec_attn.layer_norm.bias,
+                layer_str + "rel_attn/o/kernel": b.dec_attn.o_net.weight,
+                layer_str + "rel_attn/qkv/kernel": b.dec_attn.qkv_net.weight,
+                layer_str + "rel_attn/r/kernel": b.dec_attn.r_net.weight,
+                layer_str + "ff/LayerNorm/gamma": b.pos_ff.layer_norm.weight,
+                layer_str + "ff/LayerNorm/beta": b.pos_ff.layer_norm.bias,
+                layer_str + "ff/layer_1/kernel": b.pos_ff.CoreNet[0].weight,
+                layer_str + "ff/layer_1/bias": b.pos_ff.CoreNet[0].bias,
+                layer_str + "ff/layer_2/kernel": b.pos_ff.CoreNet[3].weight,
+                layer_str + "ff/layer_2/bias": b.pos_ff.CoreNet[3].bias,
+            }
+        )
+
+    # Relative positioning biases
+    if config.untie_r:
+        r_r_list = []
+        r_w_list = []
+        for b in model.layers:
+            r_r_list.append(b.dec_attn.r_r_bias)
+            r_w_list.append(b.dec_attn.r_w_bias)
+    else:
+        r_r_list = [model.r_r_bias]
+        r_w_list = [model.r_w_bias]
+    tf_to_pt_map.update({"transformer/r_r_bias": r_r_list, "transformer/r_w_bias": r_w_list})
+    return tf_to_pt_map
+
+
+def load_tf_weights_in_transfo_xl(model, config, tf_path):
+    """Load tf checkpoints in a pytorch model"""
+    try:
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = build_tf_to_pytorch_map(model, config)
+
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    tf_weights = {}
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        tf_weights[name] = array
+
+    for name, pointer in tf_to_pt_map.items():
+        assert name in tf_weights
+        array = tf_weights[name]
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if "kernel" in name or "proj" in name:
+            array = np.transpose(array)
+        if ("r_r_bias" in name or "r_w_bias" in name) and len(pointer) > 1:
+            # Here we will split the TF weights
+            assert len(pointer) == array.shape[0]
+            for i, p_i in enumerate(pointer):
+                arr_i = array[i, ...]
+                try:
+                    assert p_i.shape == arr_i.shape
+                except AssertionError as e:
+                    e.args += (p_i.shape, arr_i.shape)
+                    raise
+                logger.info(f"Initialize PyTorch weight {name} for layer {i}")
+                p_i.data = torch.from_numpy(arr_i)
+        else:
+            try:
+                assert pointer.shape == array.shape, (
+                    f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched"
+                )
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+            logger.info(f"Initialize PyTorch weight {name}")
+            pointer.data = torch.from_numpy(array)
+        tf_weights.pop(name, None)
+        tf_weights.pop(name + "/Adam", None)
+        tf_weights.pop(name + "/Adam_1", None)
+
+    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
+    return model
+
+
+class PositionalEmbedding(nn.Module):
+    def __init__(self, demb):
+        super().__init__()
+
+        self.demb = demb
+
+        inv_freq = 1 / (10000 ** (torch.arange(0.0, demb, 2.0) / demb))
+        self.register_buffer("inv_freq", inv_freq)
+
+    def forward(self, pos_seq, bsz=None):
+        sinusoid_inp = torch.outer(pos_seq, self.inv_freq)
+        pos_emb = torch.cat([sinusoid_inp.sin(), sinusoid_inp.cos()], dim=-1)
+
+        if bsz is not None:
+            return pos_emb[:, None, :].expand(-1, bsz, -1)
+        else:
+            return pos_emb[:, None, :]
+
+
+class PositionwiseFF(nn.Module):
+    def __init__(self, d_model, d_inner, dropout, pre_lnorm=False, layer_norm_epsilon=1e-5):
+        super().__init__()
+
+        self.d_model = d_model
+        self.d_inner = d_inner
+        self.dropout = dropout
+
+        self.CoreNet = nn.Sequential(
+            nn.Linear(d_model, d_inner),
+            nn.ReLU(inplace=True),
+            nn.Dropout(dropout),
+            nn.Linear(d_inner, d_model),
+            nn.Dropout(dropout),
+        )
+
+        self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
+
+        self.pre_lnorm = pre_lnorm
+
+    def forward(self, inp):
+        if self.pre_lnorm:
+            # layer normalization + positionwise feed-forward
+            core_out = self.CoreNet(self.layer_norm(inp))
+
+            # residual connection
+            output = core_out + inp
+        else:
+            # positionwise feed-forward
+            core_out = self.CoreNet(inp)
+
+            # residual connection + layer normalization
+            output = self.layer_norm(inp + core_out)
+
+        return output
+
+
+class RelPartialLearnableMultiHeadAttn(nn.Module):
+    def __init__(
+        self,
+        n_head,
+        d_model,
+        d_head,
+        dropout,
+        dropatt=0,
+        pre_lnorm=False,
+        r_r_bias=None,
+        r_w_bias=None,
+        layer_norm_epsilon=1e-5,
+    ):
+        super().__init__()
+
+        self.n_head = n_head
+        self.d_model = d_model
+        self.d_head = d_head
+        self.dropout = dropout
+
+        self.qkv_net = nn.Linear(d_model, 3 * n_head * d_head, bias=False)
+
+        self.drop = nn.Dropout(dropout)
+        self.dropatt = nn.Dropout(dropatt)
+        self.o_net = nn.Linear(n_head * d_head, d_model, bias=False)
+
+        self.layer_norm = nn.LayerNorm(d_model, eps=layer_norm_epsilon)
+
+        self.scale = 1 / (d_head**0.5)
+
+        self.pre_lnorm = pre_lnorm
+
+        if r_r_bias is None or r_w_bias is None:  # Biases are not shared
+            self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+            self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+        else:
+            self.r_r_bias = r_r_bias
+            self.r_w_bias = r_w_bias
+
+        self.r_net = nn.Linear(self.d_model, self.n_head * self.d_head, bias=False)
+
+    def _rel_shift(self, x):
+        zero_pad_shape = (x.size(0), 1) + x.size()[2:]
+        zero_pad = torch.zeros(zero_pad_shape, device=x.device, dtype=x.dtype)
+        x_padded = torch.cat([zero_pad, x], dim=1)
+
+        x_padded_shape = (x.size(1) + 1, x.size(0)) + x.size()[2:]
+        x_padded = x_padded.view(*x_padded_shape)
+
+        x = x_padded[1:].view_as(x)
+
+        return x
+
+    def forward(self, w, r, attn_mask=None, mems=None, head_mask=None, output_attentions=False):
+        qlen, rlen, bsz = w.size(0), r.size(0), w.size(1)
+
+        if mems is not None:
+            cat = torch.cat([mems, w], 0)
+            if self.pre_lnorm:
+                w_heads = self.qkv_net(self.layer_norm(cat))
+            else:
+                w_heads = self.qkv_net(cat)
+            r_head_k = self.r_net(r)
+
+            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
+            w_head_q = w_head_q[-qlen:]
+        else:
+            if self.pre_lnorm:
+                w_heads = self.qkv_net(self.layer_norm(w))
+            else:
+                w_heads = self.qkv_net(w)
+            r_head_k = self.r_net(r)
+
+            w_head_q, w_head_k, w_head_v = torch.chunk(w_heads, 3, dim=-1)
+
+        klen = w_head_k.size(0)
+
+        w_head_q = w_head_q.view(qlen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
+        w_head_k = w_head_k.view(klen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
+        w_head_v = w_head_v.view(klen, bsz, self.n_head, self.d_head)  # qlen x bsz x n_head x d_head
+
+        r_head_k = r_head_k.view(rlen, self.n_head, self.d_head)  # qlen x n_head x d_head
+
+        # compute attention score
+        rw_head_q = w_head_q + self.r_w_bias  # qlen x bsz x n_head x d_head
+        AC = torch.einsum("ibnd,jbnd->ijbn", (rw_head_q, w_head_k))  # qlen x klen x bsz x n_head
+
+        rr_head_q = w_head_q + self.r_r_bias
+        BD = torch.einsum("ibnd,jnd->ijbn", (rr_head_q, r_head_k))  # qlen x klen x bsz x n_head
+        BD = self._rel_shift(BD)
+
+        # [qlen x klen x bsz x n_head]
+        attn_score = AC + BD
+        attn_score.mul_(self.scale)
+
+        mask_value = torch.finfo(attn_score.dtype).min
+
+        # compute attention probability
+        if attn_mask is not None and torch.sum(attn_mask).item():
+            attn_mask = attn_mask == 1  # Switch to bool
+            if attn_mask.dim() == 2:
+                attn_score = (
+                    attn_score.float().masked_fill(attn_mask[None, :, :, None], mask_value).type_as(attn_score)
+                )
+            elif attn_mask.dim() == 3:
+                attn_score = attn_score.float().masked_fill(attn_mask[:, :, :, None], mask_value).type_as(attn_score)
+
+        # [qlen x klen x bsz x n_head]
+        attn_prob = nn.functional.softmax(attn_score, dim=1)
+        attn_prob = self.dropatt(attn_prob)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_prob = attn_prob * head_mask
+
+        # compute attention vector
+        attn_vec = torch.einsum("ijbn,jbnd->ibnd", (attn_prob, w_head_v))
+
+        # [qlen x bsz x n_head x d_head]
+        attn_vec = attn_vec.contiguous().view(attn_vec.size(0), attn_vec.size(1), self.n_head * self.d_head)
+
+        # linear projection
+        attn_out = self.o_net(attn_vec)
+        attn_out = self.drop(attn_out)
+
+        if self.pre_lnorm:
+            # residual connection
+            outputs = [w + attn_out]
+        else:
+            # residual connection + layer normalization
+            outputs = [self.layer_norm(w + attn_out)]
+
+        if output_attentions:
+            outputs.append(attn_prob)
+
+        return outputs
+
+
+class RelPartialLearnableDecoderLayer(nn.Module):
+    def __init__(self, n_head, d_model, d_head, d_inner, dropout, layer_norm_epsilon=1e-5, **kwargs):
+        super().__init__()
+
+        self.dec_attn = RelPartialLearnableMultiHeadAttn(
+            n_head, d_model, d_head, dropout, layer_norm_epsilon=layer_norm_epsilon, **kwargs
+        )
+        self.pos_ff = PositionwiseFF(
+            d_model, d_inner, dropout, pre_lnorm=kwargs.get("pre_lnorm"), layer_norm_epsilon=layer_norm_epsilon
+        )
+
+    def forward(self, dec_inp, r, dec_attn_mask=None, mems=None, head_mask=None, output_attentions=False):
+        attn_outputs = self.dec_attn(
+            dec_inp,
+            r,
+            attn_mask=dec_attn_mask,
+            mems=mems,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+        ff_output = self.pos_ff(attn_outputs[0])
+
+        outputs = [ff_output] + attn_outputs[1:]
+
+        return outputs
+
+
+class AdaptiveEmbedding(nn.Module):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, sample_softmax=False):
+        super().__init__()
+
+        self.n_token = n_token
+        self.d_embed = d_embed
+
+        self.cutoffs = cutoffs + [n_token]
+        self.div_val = div_val
+        self.d_proj = d_proj
+
+        self.emb_scale = d_proj**0.5
+
+        self.cutoff_ends = [0] + self.cutoffs
+
+        self.emb_layers = nn.ModuleList()
+        self.emb_projs = nn.ParameterList()
+        if div_val == 1:
+            self.emb_layers.append(nn.Embedding(n_token, d_embed, sparse=sample_softmax > 0))
+            if d_proj != d_embed:
+                self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
+        else:
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                d_emb_i = d_embed // (div_val**i)
+                self.emb_layers.append(nn.Embedding(r_idx - l_idx, d_emb_i))
+                self.emb_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
+
+    def forward(self, inp):
+        if self.div_val == 1:
+            embed = self.emb_layers[0](inp)
+            if self.d_proj != self.d_embed:
+                embed = nn.functional.linear(embed, self.emb_projs[0])
+        else:
+            param = next(self.parameters())
+            inp_flat = inp.view(-1)
+            emb_flat = torch.zeros([inp_flat.size(0), self.d_proj], dtype=param.dtype, device=param.device)
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+
+                mask_i = (inp_flat >= l_idx) & (inp_flat < r_idx)
+                indices_i = mask_i.nonzero().squeeze()
+
+                if indices_i.numel() == 0:
+                    continue
+
+                inp_i = inp_flat.index_select(0, indices_i) - l_idx
+                emb_i = self.emb_layers[i](inp_i)
+                emb_i = nn.functional.linear(emb_i, self.emb_projs[i])
+
+                emb_flat.index_copy_(0, indices_i, emb_i)
+
+            embed_shape = inp.size() + (self.d_proj,)
+            embed = emb_flat.view(embed_shape)
+
+        embed.mul_(self.emb_scale)
+
+        return embed
+
+
+class TransfoXLPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: TransfoXLConfig
+    load_tf_weights = load_tf_weights_in_transfo_xl
+    base_model_prefix = "transformer"
+
+    def _init_weight(self, weight):
+        if self.config.init == "uniform":
+            nn.init.uniform_(weight, -self.config.init_range, self.config.init_range)
+        elif self.config.init == "normal":
+            nn.init.normal_(weight, 0.0, self.config.init_std)
+
+    def _init_bias(self, bias):
+        nn.init.constant_(bias, 0.0)
+
+    def _init_weights(self, m):
+        """Initialize the weights."""
+        classname = m.__class__.__name__
+        if classname.find("Linear") != -1:
+            if hasattr(m, "weight") and m.weight is not None:
+                self._init_weight(m.weight)
+            if hasattr(m, "bias") and m.bias is not None:
+                self._init_bias(m.bias)
+        elif classname.find("AdaptiveEmbedding") != -1:
+            if hasattr(m, "emb_projs"):
+                for i in range(len(m.emb_projs)):
+                    if m.emb_projs[i] is not None:
+                        nn.init.normal_(m.emb_projs[i], 0.0, self.config.proj_init_std)
+        elif classname.find("Embedding") != -1:
+            if hasattr(m, "weight"):
+                self._init_weight(m.weight)
+        elif classname.find("ProjectedAdaptiveLogSoftmax") != -1:
+            if hasattr(m, "cluster_weight") and m.cluster_weight is not None:
+                self._init_weight(m.cluster_weight)
+            if hasattr(m, "cluster_bias") and m.cluster_bias is not None:
+                self._init_bias(m.cluster_bias)
+            if hasattr(m, "out_projs"):
+                for i in range(len(m.out_projs)):
+                    if m.out_projs[i] is not None:
+                        nn.init.normal_(m.out_projs[i], 0.0, self.config.proj_init_std)
+        elif classname.find("LayerNorm") != -1:
+            if hasattr(m, "weight"):
+                nn.init.normal_(m.weight, 1.0, self.config.init_std)
+            if hasattr(m, "bias") and m.bias is not None:
+                self._init_bias(m.bias)
+        else:
+            if hasattr(m, "r_emb"):
+                self._init_weight(m.r_emb)
+            if hasattr(m, "r_w_bias"):
+                self._init_weight(m.r_w_bias)
+            if hasattr(m, "r_r_bias"):
+                self._init_weight(m.r_r_bias)
+            if hasattr(m, "r_bias"):
+                self._init_bias(m.r_bias)
+
+    def resize_token_embeddings(self, new_num_tokens: Optional[int] = None, layer: Optional[int] = -1):
+        """
+        Resize input token embeddings matrix of the model if new_num_tokens != config.vocab_size. Take care of tying
+        weights embeddings afterwards if the model class has a *tie_weights()* method.
+
+        Arguments:
+            new_num_tokens: (*optional*) int:
+                New number of tokens in the embedding matrix. Increasing the size will add newly initialized vectors at
+                the end. Reducing the size will remove vectors from the end. If not provided or None: does nothing and
+                just returns a pointer to the input tokens `torch.nn.Embeddings` Module of the model.
+            layer: (*optional*) int:
+                Layer of the *AdaptiveEmbedding* where the resizing should be done. Per default the last layer will be
+                resized. Be aware that when resizing other than the last layer, you have to ensure that the new
+                token(s) in the tokenizer are at the corresponding position.
+
+        Return: `torch.nn.Embeddings` Pointer to the input tokens Embeddings Module of the model
+        """
+        base_model = getattr(self, self.base_model_prefix, self)  # get the base model if needed
+
+        if new_num_tokens is None:
+            return self.get_input_embeddings()
+
+        new_num_tokens_layer, layer = self._get_new_num_tokens_layer(new_num_tokens, layer)
+        assert new_num_tokens_layer > 0, "The size of the new embedding layer cannot be 0 or less"
+        model_embeds = base_model._resize_token_embeddings(new_num_tokens_layer, layer)
+
+        # Update base model and current model config
+        self.config.vocab_size = new_num_tokens
+        base_model.vocab_size = new_num_tokens
+        base_model.n_token = new_num_tokens
+
+        new_embedding_shapes = self._get_embedding_shapes()
+        self._resize_cutoffs(new_num_tokens, new_num_tokens_layer, new_embedding_shapes, layer)
+
+        # Tie weights again if needed
+        self.tie_weights()
+
+        return model_embeds
+
+    def _get_new_num_tokens_layer(self, new_num_tokens, layer):
+        embeddings = self.get_input_embeddings()
+        if layer == -1:
+            layer = len(embeddings.emb_layers) - 1
+        assert 0 <= layer <= len(embeddings.emb_layers) - 1
+
+        new_num_tokens_layer = (
+            new_num_tokens
+            - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[:layer])
+            - sum(emb.weight.shape[0] for emb in embeddings.emb_layers[layer + 1 :])
+        )
+        return new_num_tokens_layer, layer
+
+    def _get_embedding_shapes(self):
+        embeddings = self.get_input_embeddings()
+        return [emb.weight.shape[0] for emb in embeddings.emb_layers]
+
+    def _resize_token_embeddings(self, new_num_tokens, layer=-1):
+        embeddings = self.get_input_embeddings()
+        if new_num_tokens is None:
+            return embeddings
+        new_embeddings_layer = self._get_resized_embeddings(embeddings.emb_layers[layer], new_num_tokens)
+        embeddings.emb_layers[layer] = new_embeddings_layer
+
+        self.set_input_embeddings(embeddings)
+
+        return self.get_input_embeddings()
+
+    def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
+        embeddings = self.get_input_embeddings()
+
+        for i in range(layer, len(embeddings.cutoffs)):
+            embeddings.cutoffs[i] = sum(new_embedding_shapes[: i + 1])
+
+        embeddings.cutoff_ends = [0] + embeddings.cutoffs
+        embeddings.n_token = new_num_tokens
+
+        self.config.cutoffs = embeddings.cutoffs[:-1]
+
+        return embeddings.cutoffs
+
+
+@dataclass
+class TransfoXLModelOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+            input) to speed up sequential decoding. The token ids which have their past given to this model should not
+            be passed as input ids as they have already been computed.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    last_hidden_state: torch.FloatTensor
+    mems: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class TransfoXLSequenceClassifierOutputWithPast(ModelOutput):
+    """
+    Base class for outputs of sentence classification models.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Classification (or regression if config.num_labels==1) loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+            Classification (or regression if config.num_labels==1) scores (before SoftMax).
+        mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+            input) to speed up sequential decoding. The token ids which have their past given to this model should not
+            be passed as input ids as they have already been computed.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    mems: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class TransfoXLLMHeadModelOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        losses (`torch.FloatTensor` of shape *(batch_size, sequence_length-1)*, *optional*, returned when `labels` is provided):
+            Language modeling losses (not reduced).
+        prediction_scores (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+            Prediction scores of the language modeling head (scores for each vocabulary token after SoftMax).
+        mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks). Can be used (see `mems`
+            input) to speed up sequential decoding. The token ids which have their past given to this model should not
+            be passed as input ids as they have already been computed.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`.
+
+            Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`.
+
+            Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+            heads.
+        loss (`torch.FloatTensor` of shape `()`, *optional*, returned when `labels` is provided)
+            Reduced language modeling loss.
+    """
+
+    losses: Optional[torch.FloatTensor] = None
+    prediction_scores: Optional[torch.FloatTensor] = None
+    mems: Optional[list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    loss: Optional[torch.FloatTensor] = None
+
+    @property
+    def logits(self):
+        # prediction scores are the output of the adaptive softmax, see
+        # the file `modeling_transfo_xl_utilities`. Since the adaptive
+        # softmax returns the log softmax value, `self.prediction_scores`
+        # are strictly speaking not exactly `logits`, but behave the same
+        # way logits do.
+        return self.prediction_scores
+
+
+TRANSFO_XL_START_DOCSTRING = r"""
+
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
+    Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
+    and behavior.
+
+    Parameters:
+        config ([`TransfoXLConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TRANSFO_XL_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        mems (`list[torch.FloatTensor]` of length `config.n_layers`):
+            Contains pre-computed hidden-states (key and values in the attention blocks) as computed by the model (see
+            `mems` output below). Can be used to speed up sequential decoding. The token ids which have their mems
+            given to this model should not be passed as `input_ids` as they have already been computed.
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare Bert Model transformer outputting raw hidden-states without any specific head on top.",
+    TRANSFO_XL_START_DOCSTRING,
+)
+class TransfoXLModel(TransfoXLPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.n_token = config.vocab_size
+
+        self.d_embed = config.d_embed
+        self.d_model = config.d_model
+        self.n_head = config.n_head
+        self.d_head = config.d_head
+
+        self.word_emb = AdaptiveEmbedding(
+            config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
+        )
+
+        self.drop = nn.Dropout(config.dropout)
+
+        self.n_layer = config.n_layer
+        self.mem_len = config.mem_len
+        self.attn_type = config.attn_type
+
+        if not config.untie_r:
+            self.r_w_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+            self.r_r_bias = nn.Parameter(torch.FloatTensor(self.n_head, self.d_head))
+
+        self.layers = nn.ModuleList()
+        if config.attn_type == 0:  # the default attention
+            for i in range(config.n_layer):
+                self.layers.append(
+                    RelPartialLearnableDecoderLayer(
+                        config.n_head,
+                        config.d_model,
+                        config.d_head,
+                        config.d_inner,
+                        config.dropout,
+                        dropatt=config.dropatt,
+                        pre_lnorm=config.pre_lnorm,
+                        r_w_bias=None if config.untie_r else self.r_w_bias,
+                        r_r_bias=None if config.untie_r else self.r_r_bias,
+                        layer_norm_epsilon=config.layer_norm_epsilon,
+                    )
+                )
+        else:  # learnable embeddings and absolute embeddings are not used in our pretrained checkpoints
+            raise NotImplementedError  # Removed them to avoid maintaining dead code
+
+        self.same_length = config.same_length
+        self.clamp_len = config.clamp_len
+
+        if self.attn_type == 0:  # default attention
+            self.pos_emb = PositionalEmbedding(self.d_model)
+        else:  # learnable embeddings and absolute embeddings
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.word_emb
+
+    def set_input_embeddings(self, new_embeddings):
+        self.word_emb = new_embeddings
+
+    def backward_compatible(self):
+        self.sample_softmax = -1
+
+    def reset_memory_length(self, mem_len):
+        self.mem_len = mem_len
+
+    def _prune_heads(self, heads):
+        logger.info("Head pruning is not implemented for Transformer-XL model")
+        pass
+
+    def init_mems(self, bsz):
+        if self.mem_len > 0:
+            mems = []
+            param = next(self.parameters())
+            for i in range(self.n_layer):
+                empty = torch.zeros(self.mem_len, bsz, self.config.d_model, dtype=param.dtype, device=param.device)
+                mems.append(empty)
+
+            return mems
+        else:
+            return None
+
+    def _update_mems(self, hids, mems, mlen, qlen):
+        # does not deal with None
+        if mems is None:
+            return None
+
+        # mems is not None
+        assert len(hids) == len(mems), "len(hids) != len(mems)"
+
+        # There are `mlen + qlen` steps that can be cached into mems
+        with torch.no_grad():
+            new_mems = []
+            end_idx = mlen + max(0, qlen)
+            beg_idx = max(0, end_idx - self.mem_len)
+            for i in range(len(hids)):
+                cat = torch.cat([mems[i], hids[i]], dim=0)
+                new_mems.append(cat[beg_idx:end_idx].detach())
+
+        return new_mems
+
+    @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TransfoXLModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        mems: Optional[list[torch.FloatTensor]] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TransfoXLModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # the original code for Transformer-XL used shapes [len, bsz] but we want a unified interface in the library
+        # so we transpose here from shape [bsz, len] to shape [len, bsz]
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_ids = input_ids.transpose(0, 1).contiguous()
+            qlen, bsz = input_ids.size()
+        elif inputs_embeds is not None:
+            inputs_embeds = inputs_embeds.transpose(0, 1).contiguous()
+            qlen, bsz = inputs_embeds.shape[0], inputs_embeds.shape[1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if mems is None:
+            mems = self.init_mems(bsz)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] (a head_mask for each layer)
+        # and head_mask is converted to shape [num_hidden_layers x qlen x klen x bsz x n_head]
+        if head_mask is not None:
+            if head_mask.dim() == 1:
+                head_mask = head_mask.unsqueeze(0).unsqueeze(0).unsqueeze(0).unsqueeze(0)
+                head_mask = head_mask.expand(self.n_layer, -1, -1, -1, -1)
+            elif head_mask.dim() == 2:
+                head_mask = head_mask.unsqueeze(1).unsqueeze(1).unsqueeze(1)
+            head_mask = head_mask.to(
+                dtype=next(self.parameters()).dtype
+            )  # switch to float if need + fp16 compatibility
+        else:
+            head_mask = [None] * self.n_layer
+
+        if inputs_embeds is not None:
+            word_emb = inputs_embeds
+        else:
+            word_emb = self.word_emb(input_ids)
+
+        mlen = mems[0].size(0) if mems is not None else 0
+        klen = mlen + qlen
+        if self.same_length:
+            all_ones = word_emb.new_ones((qlen, klen), dtype=torch.bool)
+            mask_len = klen - self.mem_len
+            if mask_len > 0:
+                mask_shift_len = qlen - mask_len
+            else:
+                mask_shift_len = qlen
+            dec_attn_mask = (torch.triu(all_ones, 1 + mlen) + torch.tril(all_ones, -mask_shift_len))[:, :, None]  # -1
+        else:
+            dec_attn_mask = torch.triu(word_emb.new_ones((qlen, klen), dtype=torch.bool), diagonal=1 + mlen)[
+                :, :, None
+            ]
+
+        hids = []
+        attentions = [] if output_attentions else None
+        if self.attn_type == 0:  # default
+            pos_seq = torch.arange(klen - 1, -1, -1.0, device=word_emb.device, dtype=torch.int64).type_as(
+                dtype=word_emb.dtype
+            )
+            if self.clamp_len > 0:
+                pos_seq.clamp_(max=self.clamp_len)
+            pos_emb = self.pos_emb(pos_seq)
+
+            core_out = self.drop(word_emb)
+            pos_emb = self.drop(pos_emb)
+
+            for i, layer in enumerate(self.layers):
+                hids.append(core_out)
+                mems_i = None if mems is None else mems[i]
+                layer_outputs = layer(
+                    core_out,
+                    pos_emb,
+                    dec_attn_mask=dec_attn_mask,
+                    mems=mems_i,
+                    head_mask=head_mask[i],
+                    output_attentions=output_attentions,
+                )
+                core_out = layer_outputs[0]
+                if output_attentions:
+                    attentions.append(layer_outputs[1])
+        else:  # learnable embeddings and absolute embeddings
+            raise NotImplementedError  # Removed these to avoid maintaining dead code - They are not used in our pretrained checkpoint
+
+        core_out = self.drop(core_out)
+
+        new_mems = self._update_mems(hids, mems, mlen, qlen)
+
+        if output_hidden_states:
+            # Add last layer and transpose to library standard shape [bsz, len, hidden_dim]
+            hids.append(core_out)
+            hids = tuple(t.transpose(0, 1).contiguous() for t in hids)
+        else:
+            hids = None
+        if output_attentions:
+            # Transpose to library standard shape [bsz, n_heads, query_seq_len, key_seq_len]
+            attentions = tuple(t.permute(2, 3, 0, 1).contiguous() for t in attentions)
+        # We transpose back here to shape [bsz, len, hidden_dim]
+        core_out = core_out.transpose(0, 1).contiguous()
+
+        if not return_dict:
+            return tuple(v for v in [core_out, new_mems, hids, attentions] if v is not None)
+
+        return TransfoXLModelOutput(
+            last_hidden_state=core_out,
+            mems=new_mems,
+            hidden_states=hids,
+            attentions=attentions,
+        )
+
+
+@add_start_docstrings(
+    """
+    The Transformer-XL Model with a language modeling head on top (adaptive softmax with weights tied to the adaptive
+    input embeddings)
+    """,
+    TRANSFO_XL_START_DOCSTRING,
+)
+class TransfoXLLMHeadModel(TransfoXLPreTrainedModel):
+    _tied_weights_keys = [r"crit\.out_projs\.\d+", r"crit\.out_layers\.\d+\.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.transformer = TransfoXLModel(config)
+        self.sample_softmax = config.sample_softmax
+        self.trainer_compatible = getattr(config, "trainer_compatible", False)
+
+        if not self.trainer_compatible:
+            warnings.warn(
+                "The output of TransfoXL will be updated in v5 to support a single loss as first argument. In order "
+                "to use that updated output, please specify `trainer_compatible=True` as your configuration"
+                " attribute.",
+                DeprecationWarning,
+            )
+
+        assert self.sample_softmax <= 0, (
+            "Sampling from the softmax is not implemented yet. Please look at issue: #3310:"
+            " https://github.com/huggingface/transformers/issues/3310"
+        )
+
+        self.crit = ProjectedAdaptiveLogSoftmax(
+            config.vocab_size, config.d_embed, config.d_model, config.cutoffs, div_val=config.div_val
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def tie_weights(self):
+        """
+        Run this to be sure output and input (adaptive) softmax weights are tied
+        """
+
+        if self.config.tie_word_embeddings:
+            for i in range(len(self.crit.out_layers)):
+                self._tie_or_clone_weights(self.crit.out_layers[i], self.transformer.word_emb.emb_layers[i])
+        if self.config.tie_projs:
+            for i, tie_proj in enumerate(self.config.tie_projs):
+                if tie_proj and self.config.div_val == 1 and self.config.d_model != self.config.d_embed:
+                    if self.config.torchscript:
+                        self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[0].clone())
+                    else:
+                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[0]
+                elif tie_proj and self.config.div_val != 1:
+                    if self.config.torchscript:
+                        self.crit.out_projs[i] = nn.Parameter(self.transformer.word_emb.emb_projs[i].clone())
+                    else:
+                        self.crit.out_projs[i] = self.transformer.word_emb.emb_projs[i]
+
+    def reset_memory_length(self, mem_len):
+        self.transformer.reset_memory_length(mem_len)
+
+    def init_mems(self, bsz):
+        return self.transformer.init_mems(bsz)
+
+    @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TransfoXLLMHeadModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        mems: Optional[list[torch.FloatTensor]] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TransfoXLLMHeadModelOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if input_ids is not None:
+            bsz, tgt_len = input_ids.size(0), input_ids.size(1)
+        elif inputs_embeds is not None:
+            bsz, tgt_len = inputs_embeds.size(0), inputs_embeds.size(1)
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            mems=mems,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden = transformer_outputs[0]
+        pred_hid = last_hidden[:, -tgt_len:]
+
+        if labels is not None:
+            # Prevents all labels being -100 and throwing an error
+            # when backwarding the loss
+            miss_valid_label = labels[0, 1:].sum() == (labels.size(1) - 1) * -100
+            if miss_valid_label:
+                # Sets an <EOS> token, just to prevent loss from being NaN
+                labels[0, 1] = self.config.eos_token_id
+
+        softmax_output = self.crit(pred_hid, labels)
+        prediction_scores = softmax_output.view(bsz, tgt_len, -1) if labels is None else ()
+
+        if labels is not None:
+            losses = softmax_output.view(bsz, tgt_len - 1)
+            # Avoids from incorporating padding (-100) tokens into loss value
+            loss = losses[losses != 0].mean()
+        else:
+            losses, loss = None, None
+
+        if not return_dict:
+            if self.trainer_compatible:
+                output = (prediction_scores, losses) if losses is not None else (prediction_scores,)
+                output += transformer_outputs[1:]
+                return ((loss,) + output) if loss is not None else output
+            else:
+                output = (prediction_scores, *transformer_outputs[1:])
+                output = ((losses,) + output) if losses is not None else output
+                return (output + (loss,)) if loss is not None else output
+
+        return TransfoXLLMHeadModelOutput(
+            loss=loss,
+            prediction_scores=prediction_scores,
+            losses=losses,
+            mems=transformer_outputs.mems,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+    def get_output_embeddings(self):
+        """Double-check if you are using adaptive softmax."""
+        if self.sample_softmax > 0:
+            return self.out_layer
+        else:
+            return self.crit.out_layers[-1]
+
+    def prepare_inputs_for_generation(self, input_ids, past_key_values=None, **model_kwargs):
+        inputs = {}
+
+        # if past is defined in model kwargs then use it for faster decoding
+        if past_key_values:
+            inputs["mems"] = past_key_values
+            inputs["input_ids"] = input_ids[:, -1].unsqueeze(-1)
+        else:
+            inputs["input_ids"] = input_ids
+
+        return inputs
+
+    def _resize_cutoffs(self, new_num_tokens, new_emb_size, new_embedding_shapes, layer):
+        new_cutoffs = super()._resize_cutoffs(new_num_tokens, new_emb_size, new_embedding_shapes, layer)
+
+        self.crit.cutoffs = new_cutoffs
+        self.crit.cutoff_ends = [0] + new_cutoffs
+        self.crit.n_token = new_num_tokens
+
+    @staticmethod
+    def _reorder_cache(mems: list[torch.Tensor], beam_idx: torch.Tensor) -> list[torch.Tensor]:
+        """
+        This function is used to re-order the `mems` cache if [`~PreTrainedModel.beam_search`] or
+        [`~PreTrainedModel.beam_sample`] is called. This is required to match `mems` with the correct beam_idx at every
+        generation step.
+        """
+        return [layer_past.index_select(1, beam_idx.to(layer_past.device)) for layer_past in mems]
+
+
+@add_start_docstrings(
+    """
+    The Transformer-XL Model transformer with a sequence classification head on top (linear layer).
+
+    [`TransfoXLForSequenceClassification`] uses the last token in order to do the classification, as other causal
+    models (e.g. GPT-1) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """,
+    TRANSFO_XL_START_DOCSTRING,
+)
+class TransfoXLForSequenceClassification(TransfoXLPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = TransfoXLModel(config)
+        self.score = nn.Linear(config.d_embed, self.num_labels, bias=False)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(TRANSFO_XL_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TransfoXLSequenceClassifierOutputWithPast,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        mems: Optional[list[torch.FloatTensor]] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TransfoXLSequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            mems=mems,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size, sequence_length = input_ids.shape[:2]
+        else:
+            batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        assert self.config.pad_token_id is not None or batch_size == 1, (
+            "Cannot handle batch sizes > 1 if no padding token is defined."
+        )
+        if self.config.pad_token_id is None:
+            sequence_lengths = -1
+        else:
+            if input_ids is not None:
+                # if no pad token found, use modulo instead of reverse indexing for ONNX compatibility
+                sequence_lengths = torch.eq(input_ids, self.config.pad_token_id).int().argmax(-1) - 1
+                sequence_lengths = sequence_lengths % input_ids.shape[-1]
+                sequence_lengths = sequence_lengths.to(logits.device)
+            else:
+                sequence_lengths = -1
+                logger.warning_once(
+                    f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                    "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+                )
+
+        pooled_logits = logits[range(batch_size), sequence_lengths]
+
+        loss = None
+        if labels is not None:
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TransfoXLSequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            mems=transformer_outputs.mems,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = [
+    "AdaptiveEmbedding",
+    "TransfoXLForSequenceClassification",
+    "TransfoXLLMHeadModel",
+    "TransfoXLModel",
+    "TransfoXLPreTrainedModel",
+    "load_tf_weights_in_transfo_xl",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
new file mode 100644
index 0000000000000000000000000000000000000000..f76f3ccc6259fcb033b44eb43dd98be23482221c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/modeling_transfo_xl_utilities.py
@@ -0,0 +1,251 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Utilities for PyTorch Transformer XL model. Directly adapted from https://github.com/kimiyoung/transformer-xl.
+"""
+
+import torch
+from torch import nn
+
+
+# CUDA_MAJOR = int(torch.version.cuda.split('.')[0])
+# CUDA_MINOR = int(torch.version.cuda.split('.')[1])
+
+
+class ProjectedAdaptiveLogSoftmax(nn.Module):
+    def __init__(self, n_token, d_embed, d_proj, cutoffs, div_val=1, keep_order=False):
+        super().__init__()
+
+        self.n_token = n_token
+        self.d_embed = d_embed
+        self.d_proj = d_proj
+
+        self.cutoffs = cutoffs + [n_token]
+        self.cutoff_ends = [0] + self.cutoffs
+        self.div_val = div_val
+
+        self.shortlist_size = self.cutoffs[0]
+        self.n_clusters = len(self.cutoffs) - 1
+        self.head_size = self.shortlist_size + self.n_clusters
+
+        if self.n_clusters > 0:
+            self.cluster_weight = nn.Parameter(torch.zeros(self.n_clusters, self.d_embed))
+            self.cluster_bias = nn.Parameter(torch.zeros(self.n_clusters))
+
+        self.out_layers = nn.ModuleList()
+        self.out_projs = nn.ParameterList()
+
+        if div_val == 1:
+            for i in range(len(self.cutoffs)):
+                if d_proj != d_embed:
+                    self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_embed)))
+                else:
+                    self.out_projs.append(None)
+
+            self.out_layers.append(nn.Linear(d_embed, n_token))
+        else:
+            for i in range(len(self.cutoffs)):
+                l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                d_emb_i = d_embed // (div_val**i)
+
+                self.out_projs.append(nn.Parameter(torch.FloatTensor(d_proj, d_emb_i)))
+
+                self.out_layers.append(nn.Linear(d_emb_i, r_idx - l_idx))
+
+        self.keep_order = keep_order
+
+    def _compute_logit(self, hidden, weight, bias, proj):
+        if proj is None:
+            logit = nn.functional.linear(hidden, weight, bias=bias)
+        else:
+            # if CUDA_MAJOR <= 9 and CUDA_MINOR <= 1:
+            proj_hid = nn.functional.linear(hidden, proj.t().contiguous())
+            logit = nn.functional.linear(proj_hid, weight, bias=bias)
+            # else:
+            #     logit = torch.einsum('bd,de,ev->bv', (hidden, proj, weight.t()))
+            #     if bias is not None:
+            #         logit = logit + bias
+
+        return logit
+
+    def forward(self, hidden, labels=None, keep_order=False):
+        """
+        Params:
+            hidden :: [len*bsz x d_proj]
+            labels :: [len*bsz]
+
+        Return:
+            if labels is None: out :: [len*bsz x n_tokens] log probabilities of tokens over the vocabulary else: out ::
+            [(len-1)*bsz] Negative log likelihood. We could replace this implementation by the native PyTorch one if
+            theirs had an option to set bias on all clusters in the native one. here:
+            https://github.com/pytorch/pytorch/blob/dbe6a7a9ff1a364a8706bf5df58a1ca96d2fd9da/torch/nn/modules/adaptive.py#L138
+        """
+
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            hidden = hidden[..., :-1, :].contiguous()
+            labels = labels[..., 1:].contiguous()
+            hidden = hidden.view(-1, hidden.size(-1))
+            labels = labels.view(-1)
+            if hidden.size(0) != labels.size(0):
+                raise RuntimeError("Input and labels should have the same size in the batch dimension.")
+        else:
+            hidden = hidden.view(-1, hidden.size(-1))
+
+        if self.n_clusters == 0:
+            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
+            if labels is not None:
+                mask = labels != -100
+                out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)
+                out[mask] = (
+                    -nn.functional.log_softmax(logit, dim=-1)[mask].gather(1, labels[mask].unsqueeze(1)).squeeze(1)
+                )
+            else:
+                out = nn.functional.log_softmax(logit, dim=-1)
+        else:
+            # construct weights and biases
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
+
+            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+
+            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+            head_logprob = nn.functional.log_softmax(head_logit, dim=1)
+
+            if labels is None:
+                out = hidden.new_empty((head_logit.size(0), self.n_token))
+            else:
+                out = torch.zeros_like(labels, dtype=hidden.dtype, device=hidden.device)
+
+            offset = 0
+            cutoff_values = [0] + self.cutoffs
+            for i in range(len(cutoff_values) - 1):
+                l_idx, r_idx = cutoff_values[i], cutoff_values[i + 1]
+
+                if labels is not None:
+                    mask_i = (labels >= l_idx) & (labels < r_idx)
+                    indices_i = mask_i.nonzero().squeeze()
+
+                    if indices_i.numel() == 0:
+                        continue
+
+                    target_i = labels.index_select(0, indices_i) - l_idx
+                    head_logprob_i = head_logprob.index_select(0, indices_i)
+                    hidden_i = hidden.index_select(0, indices_i)
+                else:
+                    hidden_i = hidden
+
+                if i == 0:
+                    if labels is not None:
+                        logprob_i = head_logprob_i.gather(1, target_i[:, None]).squeeze(1)
+                    else:
+                        out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
+                else:
+                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+                    tail_logit_i = self._compute_logit(hidden_i, weight_i, bias_i, proj_i)
+                    tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
+                    cluster_prob_idx = self.cutoffs[0] + i - 1  # No probability for the head cluster
+                    if labels is not None:
+                        logprob_i = head_logprob_i[:, cluster_prob_idx] + tail_logprob_i.gather(
+                            1, target_i[:, None]
+                        ).squeeze(1)
+                    else:
+                        logprob_i = head_logprob[:, cluster_prob_idx, None] + tail_logprob_i
+                        out[:, l_idx:r_idx] = logprob_i
+
+                if labels is not None:
+                    if (hasattr(self, "keep_order") and self.keep_order) or keep_order:
+                        out.index_copy_(0, indices_i, -logprob_i)
+                    else:
+                        out[offset : offset + logprob_i.size(0)].copy_(-logprob_i)
+                    offset += logprob_i.size(0)
+
+        return out
+
+    def log_prob(self, hidden):
+        r"""
+        Computes log probabilities for all \\(n\_classes\\) From:
+        https://github.com/pytorch/pytorch/blob/master/torch/nn/modules/adaptive.p
+
+        Args:
+            hidden (Tensor): a minibatch of example
+
+        Returns:
+            log-probabilities of for each class \\(c\\) in range \\(0 <= c <= n\_classes\\), where \\(n\_classes\\) is
+            a parameter passed to `AdaptiveLogSoftmaxWithLoss` constructor. Shape:
+
+            - Input: \\((N, in\_features)\\)
+            - Output: \\((N, n\_classes)\\)
+        """
+        if self.n_clusters == 0:
+            logit = self._compute_logit(hidden, self.out_layers[0].weight, self.out_layers[0].bias, self.out_projs[0])
+            return nn.functional.log_softmax(logit, dim=-1)
+        else:
+            # construct weights and biases
+            weights, biases = [], []
+            for i in range(len(self.cutoffs)):
+                if self.div_val == 1:
+                    l_idx, r_idx = self.cutoff_ends[i], self.cutoff_ends[i + 1]
+                    weight_i = self.out_layers[0].weight[l_idx:r_idx]
+                    bias_i = self.out_layers[0].bias[l_idx:r_idx]
+                else:
+                    weight_i = self.out_layers[i].weight
+                    bias_i = self.out_layers[i].bias
+
+                if i == 0:
+                    weight_i = torch.cat([weight_i, self.cluster_weight], dim=0)
+                    bias_i = torch.cat([bias_i, self.cluster_bias], dim=0)
+
+                weights.append(weight_i)
+                biases.append(bias_i)
+
+            head_weight, head_bias, head_proj = weights[0], biases[0], self.out_projs[0]
+            head_logit = self._compute_logit(hidden, head_weight, head_bias, head_proj)
+
+            out = hidden.new_empty((head_logit.size(0), self.n_token))
+            head_logprob = nn.functional.log_softmax(head_logit, dim=1)
+
+            cutoff_values = [0] + self.cutoffs
+            for i in range(len(cutoff_values) - 1):
+                start_idx, stop_idx = cutoff_values[i], cutoff_values[i + 1]
+
+                if i == 0:
+                    out[:, : self.cutoffs[0]] = head_logprob[:, : self.cutoffs[0]]
+                else:
+                    weight_i, bias_i, proj_i = weights[i], biases[i], self.out_projs[i]
+
+                    tail_logit_i = self._compute_logit(hidden, weight_i, bias_i, proj_i)
+                    tail_logprob_i = nn.functional.log_softmax(tail_logit_i, dim=1)
+
+                    logprob_i = head_logprob[:, -i] + tail_logprob_i
+                    out[:, start_idx, stop_idx] = logprob_i
+
+            return out
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
new file mode 100644
index 0000000000000000000000000000000000000000..70e2da0185563d4115748266474a935ffddef059
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/transfo_xl/tokenization_transfo_xl.py
@@ -0,0 +1,825 @@
+# coding=utf-8
+# Copyright 2018 Google AI, Google Brain and Carnegie Mellon University Authors and the HuggingFace Inc. team.
+# Copyright (c) 2018, NVIDIA CORPORATION.  All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Tokenization classes for Transformer XL model. Adapted from https://github.com/kimiyoung/transformer-xl.
+"""
+
+import glob
+import os
+import pickle
+import re
+from collections import Counter, OrderedDict
+from typing import Optional
+
+import numpy as np
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import (
+    cached_file,
+    check_torch_load_is_safe,
+    is_sacremoses_available,
+    is_torch_available,
+    logging,
+    requires_backends,
+    strtobool,
+    torch_only_method,
+)
+
+
+if is_sacremoses_available():
+    import sacremoses as sm
+
+
+if is_torch_available():
+    import torch
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {
+    "pretrained_vocab_file": "vocab.pkl",
+    "pretrained_vocab_file_torch": "vocab.bin",
+    "vocab_file": "vocab.txt",
+}
+
+
+PRETRAINED_CORPUS_ARCHIVE_MAP = {
+    "transfo-xl/transfo-xl-wt103": "https://huggingface.co/transfo-xl/transfo-xl-wt103/resolve/main/corpus.bin",
+}
+CORPUS_NAME = "corpus.bin"
+
+MATCH_NUMBERS = r"(?<=\d)[,.](?=\d)", r" @\g<0>@ "
+DETOKENIZE_NUMBERS = [(r" @\,@ ", r","), (r" @\.@ ", r".")]
+
+
+def tokenize_numbers(text_array: list[str]) -> list[str]:
+    """
+    Splits large comma-separated numbers and floating point values. This is done by replacing commas with ' @,@ ' and
+    dots with ' @.@ '.
+
+    Args:
+        text_array: An already tokenized text as list.
+
+    Returns:
+        A list of strings with tokenized numbers.
+
+    Example:
+
+    ```python
+    >>> tokenize_numbers(["$", "5,000", "1.73", "m"])
+    ['$', '5', '@,@', '000', '1', '@.@', '73', 'm']
+    ```"""
+    tokenized = []
+    for i in range(len(text_array)):
+        reg, sub = MATCH_NUMBERS
+        replaced = re.sub(reg, sub, text_array[i]).split()
+        tokenized.extend(replaced)
+
+    return tokenized
+
+
+def detokenize_numbers(text: str) -> str:
+    """
+    Inverts the operation of *tokenize_numbers*. This is replacing ' @,@ ' and ' @.@' by ',' and '.'.
+
+    Args:
+        text: A string where the number should be detokenized.
+
+    Returns:
+        A detokenized string.
+
+    Example:
+
+    ```python
+    >>> detokenize_numbers("$ 5 @,@ 000 1 @.@ 73 m")
+    '$ 5,000 1.73 m'
+    ```"""
+    for reg, sub in DETOKENIZE_NUMBERS:
+        text = re.sub(reg, sub, text)
+    return text
+
+
+class TransfoXLTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Transformer-XL tokenizer adapted from Vocab class in [the original
+    code](https://github.com/kimiyoung/transformer-xl). The Transformer-XL tokenizer is a word-level tokenizer (no
+    sub-word tokenization).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        special (`list[str]`, *optional*):
+            A list of special tokens (to be treated by the original implementation of this tokenizer).
+        min_freq (`int`, *optional*, defaults to 0):
+            The minimum number of times a token has to be present in order to be kept in the vocabulary (otherwise it
+            will be mapped to `unk_token`).
+        max_size (`int`, *optional*):
+            The maximum size of the vocabulary. If left unset, it will default to the size of the vocabulary found
+            after excluding the tokens according to the `min_freq` rule.
+        lower_case (`bool`, *optional*, defaults to `False`):
+            Whether or not to lowercase the input when tokenizing.
+        delimiter (`str`, *optional*):
+            The delimiter used between tokens.
+        vocab_file (`str`, *optional*):
+            File containing the vocabulary (from the original implementation).
+        pretrained_vocab_file (`str`, *optional*):
+            File containing the vocabulary as saved with the `save_pretrained()` method.
+        never_split (`list[str]`, *optional*):
+            List of tokens that should never be split. If no list is specified, will simply use the existing special
+            tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        eos_token (`str`, *optional*, defaults to `"<eos>"`):
+            The end of sequence token.
+        additional_special_tokens (`list[str]`, *optional*, defaults to `['<formula>']`):
+            A list of additional special tokens (for the HuggingFace functionality).
+        language (`str`, *optional*, defaults to `"en"`):
+            The language of this tokenizer (used for mose preprocessing).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids"]
+
+    def __init__(
+        self,
+        special=None,
+        min_freq=0,
+        max_size=None,
+        lower_case=False,
+        delimiter=None,
+        vocab_file=None,
+        pretrained_vocab_file: Optional[str] = None,
+        never_split=None,
+        unk_token="<unk>",
+        eos_token="<eos>",
+        additional_special_tokens=["<formula>"],
+        language="en",
+        **kwargs,
+    ):
+        logger.error(
+            "`TransfoXL` was deprecated due to security issues linked to `pickle.load` in `TransfoXLTokenizer`. "
+            "See more details on this model's documentation page: "
+            "`https://github.com/huggingface/transformers/blob/main/docs/source/en/model_doc/transfo-xl.md`."
+        )
+
+        requires_backends(self, "sacremoses")
+        if special is None:
+            special = []
+        self.counter = Counter()
+        self.special = special
+        self.min_freq = min_freq
+        self.max_size = max_size
+        self.lower_case = lower_case
+        self.delimiter = delimiter
+        self.vocab_file = vocab_file
+        self.punctuation_symbols = '!"#$%&()*+,-./\\:;<=>?@[\\]^_`{|}~'
+        self.punction_without_space_before_pattern = re.compile(rf"[^\s][{self.punctuation_symbols}]")
+        self.punctuation_with_space_around_pattern = self._compile_space_around_punctuation_pattern()
+        self.language = language
+        self.moses_punct_normalizer = sm.MosesPunctNormalizer(language)
+        self.moses_tokenizer = sm.MosesTokenizer(language)
+        self.moses_detokenizer = sm.MosesDetokenizer(language)
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+        # This try... catch... is not beautiful but honestly this tokenizer was not made to be used
+        # in a library like ours, at all.
+        try:
+            vocab_dict = None
+            if pretrained_vocab_file is not None:
+                # Priority on pickle files (support PyTorch and TF)
+                if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+                    raise ValueError(
+                        "This part uses `pickle.load` which is insecure and will execute arbitrary code that is "
+                        "potentially malicious. It's recommended to never unpickle data that could have come from an "
+                        "untrusted source, or that could have been tampered with. If you already verified the pickle "
+                        "data and decided to use it, you can set the environment variable "
+                        "`TRUST_REMOTE_CODE` to `True` to allow it."
+                    )
+                with open(pretrained_vocab_file, "rb") as f:
+                    vocab_dict = pickle.load(f)
+
+                # Loading a torch-saved transfo-xl vocab dict with pickle results in an integer
+                # Entering this if statement means that we tried to load a torch-saved file with pickle, and we failed.
+                # We therefore load it with torch, if it's available.
+                if isinstance(vocab_dict, int):
+                    if not is_torch_available():
+                        raise ImportError(
+                            "Not trying to load dict with PyTorch as you need to install pytorch to load "
+                            "from a PyTorch pretrained vocabulary, "
+                            "or activate it with environment variables USE_TORCH=1 and USE_TF=0."
+                        )
+                    check_torch_load_is_safe()
+                    vocab_dict = torch.load(pretrained_vocab_file, weights_only=True)
+
+            if vocab_dict is not None:
+                for key, value in vocab_dict.items():
+                    if key not in self.__dict__ or key in ["sym2idx", "idx2sym"]:
+                        self.__dict__[key] = value
+            elif vocab_file is not None:
+                self.build_vocab()
+
+        except Exception as e:
+            raise ValueError(
+                f"Unable to parse file {pretrained_vocab_file}. Unknown format. "
+                "If you tried to load a model saved through TransfoXLTokenizerFast, "
+                "please note they are not compatible."
+            ) from e
+
+        if vocab_file is not None:
+            self.build_vocab()
+
+        super().__init__(
+            special=special,
+            min_freq=min_freq,
+            max_size=max_size,
+            lower_case=lower_case,
+            delimiter=delimiter,
+            vocab_file=vocab_file,
+            pretrained_vocab_file=pretrained_vocab_file,
+            never_split=never_split,
+            unk_token=unk_token,
+            eos_token=eos_token,
+            additional_special_tokens=additional_special_tokens,
+            language=language,
+            **kwargs,
+        )
+
+        # these are not required to initialize the parent class as only used when tokenizing.
+        if never_split is None:
+            never_split = self.all_special_tokens
+        self.never_split = never_split
+
+    @property
+    def do_lower_case(self):
+        return self.lower_case
+
+    def _compile_space_around_punctuation_pattern(self):
+        look_ahead_for_special_token = f"(?=[{self.punctuation_symbols}])"
+        look_ahead_to_match_all_except_space = r"(?=[^\s])"
+        return re.compile(r"" + look_ahead_for_special_token + look_ahead_to_match_all_except_space)
+
+    def count_file(self, path, verbose=False, add_eos=False):
+        if verbose:
+            logger.info(f"counting file {path} ...")
+        assert os.path.exists(path), f"Input file {path} not found"
+
+        sents = []
+        with open(path, "r", encoding="utf-8") as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    logger.info(f"    line {idx}")
+                symbols = self.tokenize(line, add_eos=add_eos)
+                self.counter.update(symbols)
+                sents.append(symbols)
+
+        return sents
+
+    def count_sents(self, sents, verbose=False):
+        """
+        sents : a list of sentences, each a list of tokenized symbols
+        """
+        if verbose:
+            logger.info(f"counting {len(sents)} sents ...")
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                logger.info(f"    line {idx}")
+            self.counter.update(symbols)
+
+    def _build_from_file(self, vocab_file):
+        self.idx2sym = []
+        self.sym2idx = OrderedDict()
+
+        with open(vocab_file, "r", encoding="utf-8") as f:
+            for line in f:
+                symb = line.strip().split()[0]
+                self.add_symbol(symb)
+        if "<UNK>" in self.sym2idx:
+            self.unk_idx = self.sym2idx["<UNK>"]
+        elif "<unk>" in self.sym2idx:
+            self.unk_idx = self.sym2idx["<unk>"]
+        else:
+            raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if os.path.isdir(save_directory):
+            vocab_file = os.path.join(
+                save_directory,
+                (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["pretrained_vocab_file"],
+            )
+        else:
+            vocab_file = (filename_prefix + "-" if filename_prefix else "") + save_directory
+        with open(vocab_file, "wb") as f:
+            pickle.dump(self.__dict__, f)
+        return (vocab_file,)
+
+    def build_vocab(self):
+        if self.vocab_file:
+            logger.info(f"building vocab from {self.vocab_file}")
+            self._build_from_file(self.vocab_file)
+            logger.info(f"Final vocab size {len(self.sym2idx)}")
+        else:
+            logger.info(f"building vocab with min_freq={self.min_freq}, max_size={self.max_size}")
+            self.idx2sym = []
+            self.sym2idx = OrderedDict()
+
+            for sym in self.special:
+                self.add_special(sym)
+
+            for sym, cnt in self.counter.most_common(self.max_size):
+                if cnt < self.min_freq:
+                    break
+                self.add_symbol(sym)
+
+            logger.info(f"Final vocab size {len(self.sym2idx)} from {len(self.counter)} unique tokens")
+
+    @torch_only_method
+    def encode_file(self, path, ordered=False, verbose=False, add_eos=True, add_double_eos=False):
+        if verbose:
+            logger.info(f"encoding file {path} ...")
+        assert os.path.exists(path), f"Output file {path} not found"
+        encoded = []
+        with open(path, "r", encoding="utf-8") as f:
+            for idx, line in enumerate(f):
+                if verbose and idx > 0 and idx % 500000 == 0:
+                    logger.info(f"    line {idx}")
+                symbols = self.tokenize(line, add_eos=add_eos, add_double_eos=add_double_eos)
+                encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    @torch_only_method
+    def encode_sents(self, sents, ordered=False, verbose=False):
+        if verbose:
+            logger.info(f"encoding {len(sents)} sents ...")
+        encoded = []
+        for idx, symbols in enumerate(sents):
+            if verbose and idx > 0 and idx % 500000 == 0:
+                logger.info(f"    line {idx}")
+            encoded.append(self.convert_to_tensor(symbols))
+
+        if ordered:
+            encoded = torch.cat(encoded)
+
+        return encoded
+
+    def add_special(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+            setattr(self, f"{sym.strip('<>')}_idx", self.sym2idx[sym])
+
+    def add_symbol(self, sym):
+        if sym not in self.sym2idx:
+            self.idx2sym.append(sym)
+            self.sym2idx[sym] = len(self.idx2sym) - 1
+
+    def move_added_token(self, token: str, target_idx: int):
+        """
+        Moves an added token to a specific position in the vocab. This method should be used when resizing an embedding
+        layer other than the last one in the `AdaptiveEmbedding` in order to move the token in the tokenizer from the
+        default position (at the very end) to the desired one.
+
+        Args:
+            token: The token to move to a specific position in the vocab.
+            target_idx: The position where the token should be moved to.
+        """
+        assert token in self.added_tokens_encoder, "Token which should be moved has to be an added token"
+        assert token not in self.idx2sym, "Token which should be moved is already in vocab"
+
+        # Insert sym into vocab
+        self.idx2sym.insert(target_idx, token)
+        self.sym2idx[token] = target_idx
+
+        # Shift following indices in sym2idx
+        for idx in range(target_idx + 1, len(self.idx2sym)):
+            current_sym = self.idx2sym[idx]
+            self.sym2idx[current_sym] = idx
+
+        # Delete token from added_tokens
+        old_index = self._added_tokens_encoder.pop(token)
+        self._added_tokens_decoder.pop(old_index)
+
+    def moses_punct_norm(self, text):
+        return self.moses_punct_normalizer.normalize(text)
+
+    def moses_tokenize(self, text):
+        return self.moses_tokenizer.tokenize(
+            text, aggressive_dash_splits=True, return_str=False, escape=False, protected_patterns=self.never_split
+        )
+
+    def moses_pipeline(self, text: str) -> list[str]:
+        """
+        Does basic tokenization using [`sacremoses.MosesPunctNormalizer`] and [`sacremoses.MosesTokenizer`] with
+        *aggressive_dash_splits=True* (see [`sacremoses.tokenize.MosesTokenizer.tokenize`]). Additionally, large
+        comma-separated numbers and floating point values are split. E.g. "23,000 people are 1.80m tall" -> "23 @,@ 000
+        people are 1 @.@ 80m tall"
+
+        Args:
+            text: Text to be tokenize
+
+        Returns:
+            A list of tokenized string
+
+        Example:
+
+        ```python
+        >>> tokenizer = TransfoXLTokenizer.from_pretrained("transfo-xl/transfo-xl-wt103")
+        >>> tokenizer.moses_pipeline("23,000 people are 1.80 m tall")
+        ['23', '@,@', '000', 'people', 'are', '1', '@.@', '80', 'm', 'tall']
+        ```"""
+        text = self.moses_punct_norm(text)
+        text = self.moses_tokenize(text)
+        text = tokenize_numbers(text)
+        return text
+
+    def _convert_id_to_token(self, idx):
+        """Converts an id in a token (BPE) using the vocab."""
+        assert 0 <= idx < len(self), f"Index {idx} out of vocabulary range"
+        return self.idx2sym[idx]
+
+    def _convert_token_to_id(self, sym):
+        """Converts a token (str) in an id using the vocab."""
+        if sym in self.sym2idx:
+            return self.sym2idx[sym]
+        else:
+            # logger.info(f'encounter unk {sym}')
+            # assert '<eos>' not in sym
+            if hasattr(self, "unk_idx"):
+                return self.sym2idx.get(sym, self.unk_idx)
+            # Backward compatibility with pre-trained models
+            elif "<unk>" in self.sym2idx:
+                return self.sym2idx["<unk>"]
+            elif "<UNK>" in self.sym2idx:
+                return self.sym2idx["<UNK>"]
+            else:
+                raise ValueError("Token not in vocabulary and no <unk> token in vocabulary for replacement.")
+
+    def convert_tokens_to_string(self, tokens):
+        """
+        Converts a sequence of tokens (string) in a single string. Additionally, the split numbers are converted back
+        into it's original form.
+        """
+        out_string = self.moses_detokenizer.detokenize(tokens)
+        return detokenize_numbers(out_string).strip()
+
+    @torch_only_method
+    def convert_to_tensor(self, symbols):
+        return torch.LongTensor(self.convert_tokens_to_ids(symbols))
+
+    @property
+    def vocab_size(self):
+        return len(self.idx2sym)
+
+    def get_vocab(self):
+        vocab = self.sym2idx.copy()
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, line, add_eos=False, add_double_eos=False):
+        line = line.strip()
+        # convert to lower case
+        if self.lower_case:
+            line = line.lower()
+
+        # empty delimiter '' will evaluate False
+        if self.delimiter == "":
+            symbols = line
+        else:
+            symbols = self.moses_pipeline(line)
+
+        if add_double_eos:  # lm1b
+            return ["<S>"] + symbols + ["<S>"]
+        elif add_eos:
+            return symbols + ["<eos>"]
+        else:
+            return symbols
+
+
+class LMOrderedIterator:
+    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None):
+        """
+        data -- LongTensor -- the LongTensor is strictly ordered
+        """
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+
+        # Work out how cleanly we can divide the dataset into bsz parts.
+        self.n_step = data.size(0) // bsz
+
+        # Trim off any extra elements that wouldn't cleanly fit (remainders).
+        data = data.narrow(0, 0, self.n_step * bsz)
+
+        # Evenly divide the data across the bsz batches.
+        self.data = data.view(bsz, -1).t().contiguous().to(device)
+
+        # Number of mini-batches
+        self.n_batch = (self.n_step + self.bptt - 1) // self.bptt
+
+    def get_batch(self, i, bptt=None):
+        if bptt is None:
+            bptt = self.bptt
+        seq_len = min(bptt, self.data.size(0) - 1 - i)
+
+        end_idx = i + seq_len
+        beg_idx = max(0, i - self.ext_len)
+
+        data = self.data[beg_idx:end_idx]
+        target = self.data[i + 1 : i + 1 + seq_len]
+
+        data_out = data.transpose(0, 1).contiguous().to(self.device)
+        target_out = target.transpose(0, 1).contiguous().to(self.device)
+
+        return data_out, target_out, seq_len
+
+    def get_fixlen_iter(self, start=0):
+        for i in range(start, self.data.size(0) - 1, self.bptt):
+            yield self.get_batch(i)
+
+    def get_varlen_iter(self, start=0, std=5, min_len=5, max_deviation=3):
+        max_len = self.bptt + max_deviation * std
+        i = start
+        while True:
+            bptt = self.bptt if np.random.random() < 0.95 else self.bptt / 2.0
+            bptt = min(max_len, max(min_len, int(np.random.normal(bptt, std))))
+            data, target, seq_len = self.get_batch(i, bptt)
+            i += seq_len
+            yield data, target, seq_len
+            if i >= self.data.size(0) - 2:
+                break
+
+    def __iter__(self):
+        return self.get_fixlen_iter()
+
+
+class LMShuffledIterator:
+    def __init__(self, data, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
+        """
+        data -- list[LongTensor] -- there is no order among the LongTensors
+        """
+        self.data = data
+
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+        self.shuffle = shuffle
+
+    def get_sent_stream(self):
+        # index iterator
+        epoch_indices = np.random.permutation(len(self.data)) if self.shuffle else np.array(range(len(self.data)))
+
+        # sentence iterator
+        for idx in epoch_indices:
+            yield self.data[idx]
+
+    @torch_only_method
+    def stream_iterator(self, sent_stream):
+        # streams for each data in the batch
+        streams = [None] * self.bsz
+
+        data = torch.LongTensor(self.bptt, self.bsz)
+        target = torch.LongTensor(self.bptt, self.bsz)
+
+        n_retain = 0
+
+        while True:
+            # data   : [n_retain+bptt x bsz]
+            # target : [bptt x bsz]
+            data[n_retain:].fill_(-1)
+            target.fill_(-1)
+
+            valid_batch = True
+
+            for i in range(self.bsz):
+                n_filled = 0
+                try:
+                    while n_filled < self.bptt:
+                        if streams[i] is None or len(streams[i]) <= 1:
+                            streams[i] = next(sent_stream)
+                        # number of new tokens to fill in
+                        n_new = min(len(streams[i]) - 1, self.bptt - n_filled)
+                        # first n_retain tokens are retained from last batch
+                        data[n_retain + n_filled : n_retain + n_filled + n_new, i] = streams[i][:n_new]
+                        target[n_filled : n_filled + n_new, i] = streams[i][1 : n_new + 1]
+                        streams[i] = streams[i][n_new:]
+                        n_filled += n_new
+                except StopIteration:
+                    valid_batch = False
+                    break
+
+            if not valid_batch:
+                return
+
+            data_out = data.transpose(0, 1).contiguous().to(self.device)
+            target_out = target.transpose(0, 1).contiguous().to(self.device)
+
+            yield data_out, target_out, self.bptt
+
+            n_retain = min(data.size(0), self.ext_len)
+            if n_retain > 0:
+                data[:n_retain] = data[-n_retain:]
+            data.resize_(n_retain + self.bptt, data.size(1))
+
+    def __iter__(self):
+        # sent_stream is an iterator
+        sent_stream = self.get_sent_stream()
+
+        for batch in self.stream_iterator(sent_stream):
+            yield batch
+
+
+class LMMultiFileIterator(LMShuffledIterator):
+    def __init__(self, paths, vocab, bsz, bptt, device="cpu", ext_len=None, shuffle=False):
+        self.paths = paths
+        self.vocab = vocab
+
+        self.bsz = bsz
+        self.bptt = bptt
+        self.ext_len = ext_len if ext_len is not None else 0
+
+        self.device = device
+        self.shuffle = shuffle
+
+    def get_sent_stream(self, path):
+        sents = self.vocab.encode_file(path, add_double_eos=True)
+        if self.shuffle:
+            np.random.shuffle(sents)
+        sent_stream = iter(sents)
+
+        return sent_stream
+
+    def __iter__(self):
+        if self.shuffle:
+            np.random.shuffle(self.paths)
+
+        for path in self.paths:
+            # sent_stream is an iterator
+            sent_stream = self.get_sent_stream(path)
+            for batch in self.stream_iterator(sent_stream):
+                yield batch
+
+
+class TransfoXLCorpus:
+    @classmethod
+    @torch_only_method
+    def from_pretrained(cls, pretrained_model_name_or_path, cache_dir=None, *inputs, **kwargs):
+        """
+        Instantiate a pre-processed corpus.
+        """
+        vocab = TransfoXLTokenizer.from_pretrained(pretrained_model_name_or_path, *inputs, **kwargs)
+        is_local = os.path.isdir(pretrained_model_name_or_path)
+        # redirect to the cache, if necessary
+        try:
+            resolved_corpus_file = cached_file(pretrained_model_name_or_path, CORPUS_NAME, cache_dir=cache_dir)
+        except OSError:
+            logger.error(
+                f"Corpus '{pretrained_model_name_or_path}' was not found in corpus list"
+                f" ({', '.join(PRETRAINED_CORPUS_ARCHIVE_MAP.keys())}. We assumed '{pretrained_model_name_or_path}'"
+                f" was a path or url but couldn't find files {CORPUS_NAME} at this path or url."
+            )
+            return None
+        if is_local:
+            logger.info(f"loading corpus file {resolved_corpus_file}")
+        else:
+            logger.info(f"loading corpus file {CORPUS_NAME} from cache at {resolved_corpus_file}")
+
+        # Instantiate tokenizer.
+        corpus = cls(*inputs, **kwargs)
+        check_torch_load_is_safe()
+        corpus_dict = torch.load(resolved_corpus_file, weights_only=True)
+        for key, value in corpus_dict.items():
+            corpus.__dict__[key] = value
+        corpus.vocab = vocab
+        if corpus.train is not None:
+            corpus.train = torch.tensor(corpus.train, dtype=torch.long)
+        if corpus.valid is not None:
+            corpus.valid = torch.tensor(corpus.valid, dtype=torch.long)
+        if corpus.test is not None:
+            corpus.test = torch.tensor(corpus.test, dtype=torch.long)
+        return corpus
+
+    def __init__(self, *args, **kwargs):
+        self.vocab = TransfoXLTokenizer(*args, **kwargs)
+        self.dataset = None
+        self.train = None
+        self.valid = None
+        self.test = None
+
+    def build_corpus(self, path, dataset):
+        self.dataset = dataset
+
+        if self.dataset in ["ptb", "wt2", "enwik8", "text8"]:
+            self.vocab.count_file(os.path.join(path, "train.txt"))
+            self.vocab.count_file(os.path.join(path, "valid.txt"))
+            self.vocab.count_file(os.path.join(path, "test.txt"))
+        elif self.dataset == "wt103":
+            self.vocab.count_file(os.path.join(path, "train.txt"))
+        elif self.dataset == "lm1b":
+            train_path_pattern = os.path.join(
+                path,
+                "1-billion-word-language-modeling-benchmark-r13output",
+                "training-monolingual.tokenized.shuffled",
+                "news.en-*",
+            )
+            train_paths = glob.glob(train_path_pattern)
+            # the vocab will load from file when build_vocab() is called
+
+        self.vocab.build_vocab()
+
+        if self.dataset in ["ptb", "wt2", "wt103"]:
+            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True)
+            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True)
+            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True)
+        elif self.dataset in ["enwik8", "text8"]:
+            self.train = self.vocab.encode_file(os.path.join(path, "train.txt"), ordered=True, add_eos=False)
+            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=True, add_eos=False)
+            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=True, add_eos=False)
+        elif self.dataset == "lm1b":
+            self.train = train_paths
+            self.valid = self.vocab.encode_file(os.path.join(path, "valid.txt"), ordered=False, add_double_eos=True)
+            self.test = self.vocab.encode_file(os.path.join(path, "test.txt"), ordered=False, add_double_eos=True)
+
+    def get_iterator(self, split, *args, **kwargs):
+        if split == "train":
+            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
+                data_iter = LMOrderedIterator(self.train, *args, **kwargs)
+            elif self.dataset == "lm1b":
+                kwargs["shuffle"] = True
+                data_iter = LMMultiFileIterator(self.train, self.vocab, *args, **kwargs)
+        elif split in ["valid", "test"]:
+            data = self.valid if split == "valid" else self.test
+            if self.dataset in ["ptb", "wt2", "wt103", "enwik8", "text8"]:
+                data_iter = LMOrderedIterator(data, *args, **kwargs)
+            elif self.dataset == "lm1b":
+                data_iter = LMShuffledIterator(data, *args, **kwargs)
+        else:
+            data_iter = None
+            raise ValueError(f"Split not recognized: {split}")
+
+        return data_iter
+
+
+@torch_only_method
+def get_lm_corpus(datadir, dataset):
+    fn = os.path.join(datadir, "cache.pt")
+    fn_pickle = os.path.join(datadir, "cache.pkl")
+    if os.path.exists(fn):
+        logger.info("Loading cached dataset...")
+        check_torch_load_is_safe()
+        corpus = torch.load(fn_pickle, weights_only=True)
+    elif os.path.exists(fn):
+        logger.info("Loading cached dataset from pickle...")
+        if not strtobool(os.environ.get("TRUST_REMOTE_CODE", "False")):
+            raise ValueError(
+                "This part uses `pickle.load` which is insecure and will execute arbitrary code that is potentially "
+                "malicious. It's recommended to never unpickle data that could have come from an untrusted source, or "
+                "that could have been tampered with. If you already verified the pickle data and decided to use it, "
+                "you can set the environment variable `TRUST_REMOTE_CODE` to `True` to allow it."
+            )
+        with open(fn, "rb") as fp:
+            corpus = pickle.load(fp)
+    else:
+        logger.info(f"Producing dataset {dataset}...")
+        kwargs = {}
+        if dataset in ["wt103", "wt2"]:
+            kwargs["special"] = ["<eos>"]
+            kwargs["lower_case"] = False
+        elif dataset == "ptb":
+            kwargs["special"] = ["<eos>"]
+            kwargs["lower_case"] = True
+        elif dataset == "lm1b":
+            kwargs["special"] = []
+            kwargs["lower_case"] = False
+            kwargs["vocab_file"] = os.path.join(datadir, "1b_word_vocab.txt")
+        elif dataset in ["enwik8", "text8"]:
+            pass
+
+        corpus = TransfoXLCorpus(datadir, dataset, **kwargs)
+        torch.save(corpus, fn)
+
+    return corpus
+
+
+__all__ = ["TransfoXLCorpus", "TransfoXLTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..941db2f6ac5fefe66e06d598e1adbe0db1cf1769
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/__init__.py
@@ -0,0 +1,20 @@
+# flake8: noqa
+# There's no way to ignore "F401 '...' imported but unused" warnings in this
+# module, but to preserve other warnings. So, don't check this module at all.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_tvlt import *
+    from .feature_extraction_tvlt import *
+    from .processing_tvlt import *
+    from .modeling_tvlt import *
+    from .image_processing_tvlt import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..57144e349f5bcb244b4f9eeeefebd0312c4770aa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/configuration_tvlt.py
@@ -0,0 +1,187 @@
+# coding=utf-8
+# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TVLT model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TvltConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`TvltModel`]. It is used to instantiate a TVLT
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the TVLT
+    [ZinengTang/tvlt-base](https://huggingface.co/ZinengTang/tvlt-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        spectrogram_length (`int`, *optional*, defaults to 2048):
+            The time length of each audio spectrogram.
+        frequency_length (`int`, *optional*, defaults to 128):
+            The frequency length of audio spectrogram.
+        image_patch_size (`list[int]`, *optional*, defaults to `[16, 16]`):
+            The size (resolution) of each image patch.
+        audio_patch_size (`list[int]`, *optional*, defaults to `[16, 16]`):
+            The size (resolution) of each audio patch.
+        num_image_channels (`int`, *optional*, defaults to 3):
+            The number of input image channels.
+        num_audio_channels (`int`, *optional*, defaults to 1):
+            The number of input audio channels.
+        num_frames (`int`, *optional*, defaults to 8):
+            The maximum number of frames for an input video.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        use_mean_pooling (`bool`, *optional*, defaults to `False`):
+            Whether to mean pool the final hidden states instead of using the final hidden state of the [CLS] token.
+        decoder_num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the decoder.
+        decoder_hidden_size (`int`, *optional*, defaults to 512):
+            Dimensionality of the decoder.
+        decoder_num_hidden_layers (`int`, *optional*, defaults to 8):
+            Number of hidden layers in the decoder.
+        decoder_intermediate_size (`int`, *optional*, defaults to 2048):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the decoder.
+        pixel_mask_ratio (`float`, *optional*, defaults to 0.75):
+            Image patch masking ratio.
+        audio_mask_ratio (`float`, *optional*, defaults to 0.15):
+            Audio patch masking ratio.
+        audio_mask_type (`str`, *optional*, defaults to `"frame-level"`):
+            Audio patch masking type, choose between "frame-level" and "patch-level".
+        task_matching (`bool`, *optional*, defaults to `True`):
+            Whether to use vision audio matching task in pretraining.
+        task_mae (`bool`, *optional*, defaults to `True`):
+            Whether to use the masked auto-encoder (MAE) in pretraining.
+        loss_type (`str`, *optional*, defaults to `"classification"`):
+            Loss types including regression and classification.
+
+    Example:
+
+    ```python
+    >>> from transformers import TvltConfig, TvltModel
+
+    >>> # # Initializing a TVLT ZinengTang/tvlt-base style configuration
+    >>> configuration = TvltConfig()
+
+    >>> # # Initializing a model (with random weights) from the ZinengTang/tvlt-base style configuration
+    >>> model = TvltModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "tvlt"
+
+    def __init__(
+        self,
+        image_size=224,
+        spectrogram_length=2048,
+        frequency_length=128,
+        image_patch_size=[16, 16],
+        audio_patch_size=[16, 16],
+        num_image_channels=3,
+        num_audio_channels=1,
+        num_frames=8,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        qkv_bias=True,
+        use_mean_pooling=False,
+        decoder_num_attention_heads=16,
+        decoder_hidden_size=512,
+        decoder_num_hidden_layers=8,
+        decoder_intermediate_size=2048,
+        pixel_mask_ratio=0.75,
+        audio_mask_ratio=0.15,
+        audio_mask_type="frame-level",
+        task_matching=True,
+        task_mae=True,
+        loss_type="classification",
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if audio_mask_type not in ("frame-level", "patch_level"):
+            raise ValueError(
+                "audio_mask_type must be one of two acceptable strategies - {'frame_level', 'patch-level') "
+                f"got {audio_mask_type}"
+            )
+
+        self.image_size = image_size
+        self.spectrogram_length = spectrogram_length
+        self.frequency_length = frequency_length
+        self.image_patch_size = image_patch_size
+        self.audio_patch_size = audio_patch_size
+        self.num_image_channels = num_image_channels
+        self.num_audio_channels = num_audio_channels
+        self.num_frames = num_frames
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.qkv_bias = qkv_bias
+        self.use_mean_pooling = use_mean_pooling
+
+        self.decoder_num_attention_heads = decoder_num_attention_heads
+        self.decoder_hidden_size = decoder_hidden_size
+        self.decoder_num_hidden_layers = decoder_num_hidden_layers
+        self.decoder_intermediate_size = decoder_intermediate_size
+        self.pixel_mask_ratio = pixel_mask_ratio
+        self.audio_mask_ratio = audio_mask_ratio
+        self.audio_mask_type = audio_mask_type
+
+        self.task_matching = task_matching
+        self.task_mae = task_mae
+        self.loss_type = loss_type
+
+
+__all__ = ["TvltConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..b9350d31a0197926ba96e04dacfdb7bc71adbfc6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/feature_extraction_tvlt.py
@@ -0,0 +1,233 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for TVLT."""
+
+from math import ceil
+from typing import Optional, Union
+
+import numpy as np
+
+from ....audio_utils import mel_filter_bank, spectrogram, window_function
+from ....feature_extraction_sequence_utils import BatchFeature, SequenceFeatureExtractor
+from ....utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class TvltFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a TVLT audio feature extractor. This feature extractor can be used to prepare audios for the model.
+
+    This feature extractor inherits from [`FeatureExtractionMixin`] which contains most of the main methods. Users
+    should refer to this superclass for more information regarding those methods.
+
+    Args:
+        spectrogram_length (`dict[str, int]` *optional*, defaults to 2048):
+            The time length of each audio spectrogram.
+        num_channels (`int` *optional*, defaults to 1):
+            Number of audio channels.
+        patch_size (`list[int]` *optional*, defaults to `[16, 16]`):
+            The patch size of audio patch embedding.
+        feature_size (`int`, *optional*, defaults to 128):
+            The frequency length of audio spectrogram.
+        sampling_rate (`int`, *optional*, defaults to 44100):
+            The sampling rate at which the audio files should be digitalized expressed in Hertz (Hz).
+        hop_length_to_sampling_rate (`int`, *optional*, defaults to 86):
+            Hop length is length of the overlapping windows for the STFT used to obtain the Mel Frequency coefficients.
+            For example, with sampling rate 44100, the hop length is 512, with 44100 / 512 = 86
+        n_fft (`int`, *optional*, defaults to 2048):
+            Size of the Fourier transform.
+        padding_value (`float`, *optional*, defaults to 0.0):
+            Padding value used to pad the audio. Should correspond to silences.
+    """
+
+    model_input_names = ["audio_values", "audio_mask"]
+
+    def __init__(
+        self,
+        spectrogram_length=2048,
+        num_channels=1,
+        patch_size=[16, 16],
+        feature_size=128,
+        sampling_rate=44100,
+        hop_length_to_sampling_rate=86,
+        n_fft=2048,
+        padding_value=0.0,
+        **kwargs,
+    ):
+        super().__init__(
+            feature_size=feature_size,
+            sampling_rate=sampling_rate,
+            padding_value=padding_value,
+            **kwargs,
+        )
+
+        self.spectrogram_length = spectrogram_length
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.freq_len = feature_size // self.patch_size[1]
+        self.n_fft = n_fft
+        self.hop_length = sampling_rate // hop_length_to_sampling_rate
+        self.sampling_rate = sampling_rate
+        self.padding_value = padding_value
+        self.mel_filters = mel_filter_bank(
+            num_frequency_bins=1 + n_fft // 2,
+            num_mel_filters=feature_size,
+            min_frequency=0.0,
+            max_frequency=22050.0,
+            sampling_rate=sampling_rate,
+            norm="slaney",
+            mel_scale="slaney",
+        ).T
+
+    def _np_extract_fbank_features(self, waveform: np.ndarray) -> np.ndarray:
+        """
+        Compute the log-mel spectrogram of the provided audio, gives similar results to Whisper's original torch
+        implementation with 1e-5 tolerance.
+        """
+        log_spec = spectrogram(
+            waveform,
+            window_function(self.n_fft, "hann"),
+            frame_length=self.n_fft,
+            hop_length=self.hop_length,
+            power=2.0,
+            mel_filters=self.mel_filters.T,
+            log_mel="dB",
+            db_range=80.0,
+        )
+        log_spec = log_spec[:, :-1]
+        log_spec = log_spec - 20.0
+        log_spec = np.clip(log_spec / 40.0, -2.0, 0.0) + 1.0
+        return log_spec
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_attention_mask: Optional[bool] = True,
+        sampling_rate: Optional[int] = None,
+        resample: bool = False,
+        mask_audio: bool = False,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Main method to prepare one or several audio(s) for the model.
+
+        Args:
+            raw_speech (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. Must be mono channel audio, not
+                stereo, i.e. single float per timestep.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            return_attention_mask (`bool`, *optional*, default to `True`):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific feature_extractor's default. [What are attention masks?](../glossary#attention-mask)
+
+                <Tip>
+
+                For TvltTransformer models, `attention_mask` should always be passed for batched inference, to avoid
+                subtle bugs.
+
+                </Tip>
+
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors and allow automatic speech recognition
+                pipeline. Current model supports sampling rate 16000 and 44100.
+            resample (`bool`, *optional*, defaults to `False`):
+                If the sampling rate is not matched, resample the input audio to match.
+            mask_audio (`bool`, *optional*, defaults to `False`):
+                Whether or not to mask input audio for MAE task.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **audio_values** -- Audio values to be fed to a model, of shape (batch_size, num_channels, height,
+              width).
+
+            - **audio_mask** -- Audio masks to be fed to a model, of shape (batch_size, num_audio_patches).
+        """
+
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    "This feature extractor is set to support sampling rate"
+                    f" of {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled"
+                    f" with {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                "It is strongly recommended to pass the `sampling_rate` argument to this function. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+        if is_batched_numpy and len(raw_speech.shape) > 2:
+            raise ValueError(f"Only mono-channel audio is supported for input to {self}")
+        is_batched = is_batched_numpy or (
+            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], (np.ndarray, tuple, list)))
+        )
+        if is_batched:
+            raw_speech = [np.asarray([speech], dtype=np.float32).T for speech in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech, dtype=np.float32)
+        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+            raw_speech = raw_speech.astype(np.float32)
+        # always return batch
+        if not is_batched:
+            raw_speech = [np.asarray([raw_speech]).T]
+
+        # Convert audio signals to log mel spectrograms, truncate by time axis
+        audio_features = [
+            self._np_extract_fbank_features(waveform.squeeze()).T[: self.spectrogram_length] for waveform in raw_speech
+        ]
+        if isinstance(audio_features[0], list):
+            audio_features = [np.asarray(feature, dtype=np.float32) for feature in audio_features]
+
+        # Create audio attention mask
+        max_patch_len = max(
+            ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len for feature in audio_features
+        )  # The maximum number of audio patches in a batch
+        if return_attention_mask:
+            audio_mask = [
+                (ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [1]
+                + (max_patch_len - ceil(feature.shape[0] / self.patch_size[0]) * self.freq_len) * [0]
+                for feature in audio_features
+            ]
+            audio_mask = np.array(audio_mask).astype(np.float32)
+
+        # convert into correct format for padding
+        max_time_len = max_patch_len // self.freq_len * self.patch_size[0]  # The maximum audio size in a batch
+        padded_audio_features = np.ones([len(audio_features), 1, max_time_len, self.feature_size]).astype(np.float32)
+        padded_audio_features = padded_audio_features * self.padding_value
+        for i in range(len(audio_features)):
+            feature = audio_features[i]
+            padded_audio_features[i, :, : feature.shape[0], :] = feature
+
+        # return as BatchFeature
+        if return_attention_mask:
+            data = {"audio_values": padded_audio_features, "audio_mask": audio_mask}
+        else:
+            data = {"audio_values": padded_audio_features}
+
+        encoded_inputs = BatchFeature(data=data, tensor_type=return_tensors)
+        return encoded_inputs
+
+
+__all__ = ["TvltFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..01fb42429a96fdb8b171dbeedbd21e45a66a718f
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/image_processing_tvlt.py
@@ -0,0 +1,438 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for TVLT."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
+    get_resize_output_image_size,
+    resize,
+    to_channel_dimension_format,
+)
+from ....image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_valid_image,
+    to_numpy_array,
+    valid_images,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ....utils import TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+def make_batched(videos) -> list[list[ImageInput]]:
+    if isinstance(videos, (list, tuple)) and isinstance(videos[0], (list, tuple)):
+        return videos
+
+    elif isinstance(videos, (list, tuple)) and is_valid_image(videos[0]):
+        videos_dim = np.array(videos[0]).ndim
+        if videos_dim == 3:
+            return [videos]
+        elif videos_dim == 4:
+            return videos
+
+    elif is_valid_image(videos):
+        videos_dim = np.array(videos).ndim
+        if videos_dim == 3:
+            return [[videos]]
+        elif videos_dim == 4:
+            return [videos]
+        elif videos_dim == 5:
+            return videos
+
+    raise ValueError(f"Could not make batched video from {videos}")
+
+
+class TvltImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a TVLT image processor.
+
+    This processor can be used to prepare either videos or images for the model by converting images to 1-frame videos.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by the
+            `do_resize` parameter in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the output image after resizing. The shortest edge of the image will be resized to
+            `size["shortest_edge"]` while maintaining the aspect ratio of the original image. Can be overridden by
+            `size` in the `preprocess` method.
+        patch_size (`list[int]` *optional*, defaults to [16,16]):
+            The patch size of image patch embedding.
+        num_frames (`int` *optional*, defaults to 8):
+            The maximum number of video frames.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by the `do_center_crop`
+            parameter in the `preprocess` method.
+        crop_size (`dict[str, int]`, *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after applying the center crop. Can be overridden by the `crop_size` parameter in the
+            `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to 1/255):
+            Defines the scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter
+            in the `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = [
+        "pixel_values",
+        "pixel_mask",
+        "pixel_values_mixed",
+        "pixel_mask_mixed",
+    ]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        patch_size: list[int] = [16, 16],
+        num_frames: int = 8,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_MEAN,
+        image_std: Optional[Union[float, list[float]]] = IMAGENET_STANDARD_STD,
+        init_mask_generator=False,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 224}
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+
+        self.do_resize = do_resize
+        self.size = size
+        self.patch_size = patch_size
+        self.num_frames = num_frames
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean
+        self.image_std = image_std
+        self._valid_processor_keys = [
+            "videos",
+            "do_resize",
+            "size",
+            "patch_size",
+            "num_frames",
+            "resample",
+            "do_center_crop",
+            "crop_size",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "is_mixed",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the output image. If `size` is of the form `{"height": h, "width": w}`, the output image will
+                have the size `(h, w)`. If `size` is of the form `{"shortest_edge": s}`, the output image will have its
+                shortest edge of length `s` while keeping the aspect ratio of the original image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                Resampling filter to use when resiizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        size = get_size_dict(size, default_to_square=False)
+        if "shortest_edge" in size:
+            output_size = get_resize_output_image_size(
+                image, size["shortest_edge"], default_to_square=False, input_data_format=input_data_format
+            )
+        elif "height" in size and "width" in size:
+            output_size = (size["height"], size["width"])
+        else:
+            raise ValueError(f"Size must have 'height' and 'width' or 'shortest_edge' as keys. Got {size.keys()}")
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def _preprocess_image(
+        self,
+        image: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """Preprocesses a single image."""
+
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays.
+        image = to_numpy_array(image)
+
+        if do_rescale and is_scaled_image(image):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            input_data_format = infer_channel_dimension_format(image)
+
+        if do_resize:
+            image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+        if do_center_crop:
+            image = self.center_crop(image, size=crop_size, input_data_format=input_data_format)
+
+        if do_rescale:
+            image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+        if do_normalize:
+            image = self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+        image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+        return image
+
+    def preprocess(
+        self,
+        videos: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        patch_size: Optional[list[int]] = None,
+        num_frames: Optional[int] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        is_mixed: bool = False,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an videos or image or batch of videos or images.
+
+        Args:
+            videos (`ImageInput`):
+                Images or videos to preprocess. Expects a single or batch of frames with pixel values ranging from 0 to
+                255. If passing in frames with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after applying resize.
+            patch_size (`list[int]` *optional*, defaults to self.patch_size):
+                The patch size of image patch embedding.
+            num_frames (`int` *optional*, defaults to self.num_frames):
+                The maximum number of video frames.
+            resample (`PILImageResampling`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_centre_crop`):
+                Whether to centre crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the image after applying the centre crop.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation.
+            is_mixed (`bool`, *optional*):
+                If the input video has negative samples.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                    - Unset: Use the inferred channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **pixel_values** -- Pixel values to be fed to a model, of shape (batch_size, num_channels, height,
+              width).
+
+            - **pixel_mask** -- Pixel masks to be fed to a model, of shape (batch_size, num_pixel_patches).
+
+            - **pixel_values_mixed** -- Pixel values with both positive or negative to be fed to a model, of shape
+              (batch_size, num_channels, height, width).
+
+            - **pixel_mask_mixed** -- Pixel masks with both positive or negative to be fed to a model, of shape
+              (batch_size, num_pixel_patches).
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        size = size if size is not None else self.size
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size")
+        patch_size = patch_size if patch_size is not None else self.patch_size
+        num_frames = num_frames if patch_size is not None else self.num_frames
+
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        if not valid_images(videos):
+            raise ValueError(
+                "Invalid image or video type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        videos = make_batched(videos)
+
+        # Check number of frames is fewer than maximum frames
+        for video in videos:
+            if len(video) > self.num_frames:
+                raise ValueError(
+                    f"number of frames must not be greater than the maximum frames of the model {self.num_frames}."
+                )
+
+        max_num_frames = max(len(video) for video in videos)
+        num_patches_per_image = (size["shortest_edge"] // patch_size[0]) ** 2
+        video_masks = np.array(
+            [
+                len(video) * num_patches_per_image * [1] + (max_num_frames - len(video)) * num_patches_per_image * [0]
+                for video in videos
+            ]
+        )
+
+        videos = [
+            [
+                self._preprocess_image(
+                    image=img,
+                    do_resize=do_resize,
+                    size=size,
+                    resample=resample,
+                    do_center_crop=do_center_crop,
+                    crop_size=crop_size,
+                    do_rescale=do_rescale,
+                    rescale_factor=rescale_factor,
+                    do_normalize=do_normalize,
+                    image_mean=image_mean,
+                    image_std=image_std,
+                    data_format=data_format,
+                    input_data_format=input_data_format,
+                )
+                for img in video
+            ]
+            for video in videos
+        ]
+
+        # If videos contain both positive/negative, use mixed key for video-audio matching task
+        if is_mixed:
+            data = {"pixel_values_mixed": videos, "pixel_mask_mixed": video_masks}
+        else:
+            data = {"pixel_values": videos, "pixel_mask": video_masks}
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["TvltImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b21df928ff3d74022d858f7a73578802127dd08
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/modeling_tvlt.py
@@ -0,0 +1,1274 @@
+# coding=utf-8
+# Copyright 2023 MURGe-Lab and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch TVLT model."""
+
+import collections.abc
+import math
+from copy import deepcopy
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ....activations import ACT2FN
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput, SequenceClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_tvlt import TvltConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CONFIG_FOR_DOC = "TvltConfig"
+_CHECKPOINT_FOR_DOC = "ZinengTang/tvlt-base"
+
+
+@dataclass
+class TvltModelOutput(ModelOutput):
+    """
+    Class for TvltModel's outputs, with potential hidden states and attentions.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+            Sequence of hidden-states at the output of the last layer of the model.
+        last_pixel_hidden_state (`torch.FloatTensor` of shape `(batch_size, pixel_sequence_length, hidden_size)`):
+            Pixel sequence of hidden-states at the output of the last layer of the model.
+        last_audio_hidden_state (`torch.FloatTensor` of shape `(batch_size, audio_sequence_length, hidden_size)`):
+            Audio sequence of hidden-states at the output of the last layer of the model.
+        pixel_label_masks (`torch.FloatTensor` of shape `(batch_size, pixel_patch_length)`):
+            Tensor indicating which pixel patches are masked (1) and which are not (0).
+        audio_label_masks (`torch.FloatTensor` of shape `(batch_size, audio_patch_length)`):
+            Tensor indicating which audio patches are masked (1) and which are not (0).
+        pixel_ids_restore (`torch.LongTensor` of shape `(batch_size, pixel_patch_length)`):
+            Tensor containing the ids permutation of pixel masking.
+        audio_ids_restore (`torch.LongTensor` of shape `(batch_size, audio_patch_length)`):
+            Tensor containing the ids permutation of audio masking.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    last_pixel_hidden_state: Optional[torch.FloatTensor] = None
+    last_audio_hidden_state: Optional[torch.FloatTensor] = None
+    pixel_label_masks: Optional[torch.LongTensor] = None
+    audio_label_masks: Optional[torch.LongTensor] = None
+    pixel_ids_restore: Optional[torch.LongTensor] = None
+    audio_ids_restore: Optional[torch.LongTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class TvltDecoderOutput(ModelOutput):
+    """
+    Class for TvltDecoder's outputs, with potential hidden states and attentions.
+
+    Args:
+        logits (`torch.FloatTensor` of shape `(batch_size, patch_size ** 2 * num_channels)`):
+            Pixel reconstruction logits.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+    """
+
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+class TvltForPreTrainingOutput(ModelOutput):
+    """
+    Class for TvltForPreTraining's outputs, with potential hidden states and attentions.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`):
+            Pixel reconstruction loss.
+        matching_logits (`torch.FloatTensor` of shape `(batch_size, 1)`):
+            Matching objective logits.
+        pixel_logits (`torch.FloatTensor` of shape
+            `(batch_size, pixel_patch_length, image_patch_size ** 3 * pixel_num_channels)`): Pixel reconstruction
+            logits.
+        audio_logits (`torch.FloatTensor` of shape
+            `(batch_size, audio_patch_length, image_patch_size[0] * image_patch_size[1])`): Audio reconstruction
+            logits.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings and one for the output of each layer) of
+            shape `(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each layer
+            plus the initial embedding outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    matching_logits: Optional[torch.FloatTensor] = None
+    pixel_logits: Optional[torch.FloatTensor] = None
+    audio_logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+def generate_pixel_mask_noise(pixel_values, pixel_mask=None, mask_ratio=0.75):
+    """Generate noise for audio masking."""
+
+    batch_size, seq_len = pixel_values.shape[:2]
+    noise = torch.rand((batch_size, seq_len), device=pixel_values.device)  # noise in [0, 1]
+    len_keep = int(seq_len * (1 - mask_ratio))
+    return noise, len_keep
+
+
+def generate_audio_mask_noise(audio_values, audio_mask=None, mask_ratio=0.75, mask_type="patch-level", freq_len=8):
+    """Generate noise for audio masking."""
+
+    batch_size, seq_len = audio_values.shape[:2]
+    if mask_type == "frame-level":
+        num_time_patches = seq_len // freq_len
+        noise = (
+            torch.rand(batch_size, num_time_patches, device=audio_values.device)
+            .unsqueeze(-1)
+            .repeat(1, 1, freq_len)
+            .view(batch_size, seq_len)
+        )  # noise in [0, 1]
+    elif mask_type == "patch-level":
+        noise = torch.rand(batch_size, seq_len, device=audio_values.device)  # noise in [0, 1]
+    len_keep = int(seq_len * (1 - mask_ratio))
+    return noise, len_keep
+
+
+def random_masking(sequence, noise, len_keep, attention_masks=None):
+    """
+    Perform random masking by per-sample shuffling on frame-level. Per-sample shuffling is done by argsort random
+    noise. sequence: [batch_size, seq_len, hidden_dim], sequence
+    """
+
+    batch_size, seq_len, hidden_dim = sequence.shape
+
+    # sort noise for each sample
+    ids_shuffle = torch.argsort(noise, dim=1)  # ascend: small is keep, large is remove
+    ids_restore = torch.argsort(ids_shuffle, dim=1)
+
+    # keep the first subset
+    ids_keep = ids_shuffle[:, :len_keep]
+    sequence_masked = torch.gather(sequence, dim=1, index=ids_keep.unsqueeze(-1).repeat(1, 1, hidden_dim))
+
+    # generate the binary mask: 0 is keep, 1 is remove
+    label_masks = torch.ones([batch_size, seq_len], device=sequence.device)
+    label_masks[:, :len_keep] = 0
+    # unshuffle to get the binary mask
+    label_masks = torch.gather(label_masks, dim=1, index=ids_restore)
+
+    if attention_masks is not None:
+        label_masks *= attention_masks
+        attention_masks = torch.gather(attention_masks, dim=1, index=ids_keep)
+
+    return sequence_masked, attention_masks, label_masks, ids_restore
+
+
+class TvltPixelEmbeddings(nn.Module):
+    """Construct the patch and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.patch_embeddings = TvltPixelPatchEmbeddings(config)
+        self.num_patches_per_image = self.patch_embeddings.num_patches_per_image
+
+        self.type_embed_v = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, config.hidden_size))
+        self.pos_embed_v = nn.Parameter(torch.zeros(1, self.num_patches_per_image, config.hidden_size))
+
+        self.config = config
+
+    def forward(self, pixel_values, attention_masks=None):
+        # create patch embeddings
+        batch_size, num_frames, num_channels, height, width = pixel_values.shape
+
+        embeddings = self.patch_embeddings(pixel_values)
+        embeddings += self.pos_embed_v.repeat(1, num_frames, 1)
+        embeddings += torch.repeat_interleave(self.temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1)
+        embeddings += self.type_embed_v
+
+        return embeddings, attention_masks
+
+
+class TvltAudioEmbeddings(nn.Module):
+    """Construct the patch and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.patch_embeddings = TvltAudioPatchEmbeddings(config)
+        self.num_patches = self.patch_embeddings.num_patches
+
+        self.type_embed_a = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+        self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
+        self.pos_embed_a = nn.Parameter(torch.zeros(1, self.num_patches // self.num_freq_patches, config.hidden_size))
+        self.freq_embed = nn.Parameter(torch.zeros(1, self.num_freq_patches, config.hidden_size))
+
+        self.num_freq_patches = config.frequency_length // config.audio_patch_size[1]
+        self.config = config
+
+    def forward(self, audio_values, attention_masks=None):
+        # create patch embeddings
+        embeddings = self.patch_embeddings(audio_values)
+
+        num_time_patches = embeddings.size(1) // self.num_freq_patches
+        embeddings += self.freq_embed.repeat(1, num_time_patches, 1)
+        embeddings += torch.repeat_interleave(self.pos_embed_a[:, :num_time_patches], self.num_freq_patches, dim=1)
+        embeddings += self.type_embed_a
+
+        return embeddings, attention_masks
+
+
+class TvltPixelPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.image_patch_size
+        num_channels, hidden_size = config.num_image_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches_per_image = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches_per_image = num_patches_per_image
+        self.hidden_size = hidden_size
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if height != self.image_size[0] or width != self.image_size[1]:
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+            )
+
+        pixel_values = pixel_values.reshape(batch_size * num_frames, num_channels, height, width)
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+        embeddings = embeddings.reshape(batch_size, num_frames * self.num_patches_per_image, self.hidden_size)
+
+        return embeddings
+
+
+class TvltAudioPatchEmbeddings(nn.Module):
+    """
+    This class turns `audio_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        spectrogram_length, frequency_length, patch_size = (
+            config.spectrogram_length,
+            config.frequency_length,
+            config.audio_patch_size,
+        )
+        num_channels, hidden_size = config.num_audio_channels, config.hidden_size
+
+        spectrogram_size = (spectrogram_length, frequency_length)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (spectrogram_size[1] // patch_size[1]) * (spectrogram_size[0] // patch_size[0])
+        patch_shape = (spectrogram_size[0] // patch_size[0], spectrogram_size[1] // patch_size[1])
+        self.spectrogram_size = spectrogram_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+        self.patch_shape = patch_shape
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, audio_values: torch.Tensor) -> torch.Tensor:
+        batch_size, num_channels, height, width = audio_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if height > self.spectrogram_size[0] or width != self.spectrogram_size[1]:
+            raise ValueError(
+                f"Input audio size ({height}*{width}) doesn't match model"
+                f" ({self.spectrogram_size[0]}*{self.spectrogram_size[1]})."
+            )
+        embeddings = self.projection(audio_values).flatten(2).transpose(1, 2)
+
+        return embeddings
+
+
+class TvltSelfAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x):
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(*new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+        mixed_query_layer = self.query(hidden_states)
+
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in BertModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.Softmax(dim=-1)(attention_scores)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(*new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class TvltSelfOutput(nn.Module):
+    """
+    The residual connection is defined in TvltLayer instead of here (as is the case with other models), due to the
+    layernorm applied before each block.
+    """
+
+    def __init__(self, config: TvltConfig) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class TvltAttention(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.attention = TvltSelfAttention(config)
+        self.output = TvltSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+        self_outputs = self.attention(hidden_states, attention_mask, head_mask, output_attentions)
+
+        attention_output = self.output(self_outputs[0], hidden_states)
+
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class TvltIntermediate(nn.Module):
+    def __init__(self, config: TvltConfig) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+
+class TvltOutput(nn.Module):
+    def __init__(self, config: TvltConfig) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        hidden_states = hidden_states + input_tensor
+
+        return hidden_states
+
+
+class TvltLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = TvltAttention(config)
+        self.intermediate = TvltIntermediate(config)
+        self.output = TvltOutput(config)
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states, attention_mask=None, head_mask=None, output_attentions=False):
+        self_attention_outputs = self.attention(
+            self.layernorm_before(hidden_states),  # in ViLT, layernorm is applied before self-attention
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        # first residual connection
+        hidden_states = attention_output + hidden_states.to(attention_output.device)
+
+        # in ViLT, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_states)
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+class TvltEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([TvltLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        head_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(hidden_states, attention_mask, layer_head_mask, output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class TvltPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: TvltConfig
+    base_model_prefix = "tvlt"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+TVLT_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`TvltConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+TVLT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+            details.
+
+        audio_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Audio values. Audio values can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+            details.
+
+        pixel_mask (`torch.FloatTensor` of shape `(batch_size, num_pixel_patches)`):
+            Pixel masks. Pixel masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+            details.
+
+        audio_mask (`torch.FloatTensor` of shape `(batch_size, num_audio_patches)`):
+            Audio masks. Audio masks can be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for
+            details.
+
+        pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+            Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Pixel values mixed can
+            be obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
+
+        pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel masks of pixel_values_mixed. Pixel masks mixed can be obtained using [`TvltProcessor`]. See
+            [`TvltProcessor.__call__`] for details.
+
+        mask_pixel (`bool`, *optional*):
+            Whether to mask pixel for MAE tasks. Only set to True in TvltForPreTraining.
+
+        mask_audio (`bool`, *optional*):
+            Whether to mask audio for MAE tasks. Only set to True in TvltForPreTraining.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare TVLT Model transformer outputting raw hidden-states without any specific head on top.",
+    TVLT_START_DOCSTRING,
+)
+class TvltModel(TvltPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.pixel_embeddings = TvltPixelEmbeddings(config)
+        self.audio_embeddings = TvltAudioEmbeddings(config)
+        self.encoder = TvltEncoder(config)
+
+        self.cls_embedding = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+
+        if config.use_mean_pooling:
+            self.layernorm = None
+        else:
+            self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.pixel_embeddings.patch_embeddings, self.audio_embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TvltModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        audio_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.FloatTensor] = None,
+        audio_mask: Optional[torch.FloatTensor] = None,
+        mask_pixel: bool = False,
+        mask_audio: bool = False,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], TvltModelOutput]:
+        r"""
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TvltProcessor, TvltModel
+        >>> import numpy as np
+        >>> import torch
+
+        >>> num_frames = 8
+        >>> images = list(np.random.randn(num_frames, 3, 224, 224))
+        >>> audio = list(np.random.randn(10000))
+
+        >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
+        >>> model = TvltModel.from_pretrained("ZinengTang/tvlt-base")
+
+        >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
+
+        >>> outputs = model(**input_dict)
+        >>> loss = outputs.loss
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        pixel_embedding_output, pixel_mask = self.pixel_embeddings(pixel_values, pixel_mask)
+
+        audio_embedding_output, audio_mask = self.audio_embeddings(audio_values, audio_mask)
+
+        # Mask pixel if mask_pixel is True
+        pixel_label_masks = None
+        pixel_ids_restore = None
+        if mask_pixel:
+            pixel_mask_noise, pixel_len_keep = generate_pixel_mask_noise(
+                pixel_embedding_output, pixel_mask=pixel_mask, mask_ratio=self.config.pixel_mask_ratio
+            )
+            pixel_embedding_output, pixel_mask, pixel_label_masks, pixel_ids_restore = random_masking(
+                pixel_embedding_output,
+                pixel_mask_noise,
+                pixel_len_keep,
+                attention_masks=pixel_mask,
+            )
+
+        # Mask audio if mask_audio is True
+        audio_label_masks = None
+        audio_ids_restore = None
+        if mask_audio:
+            num_freq_patches = self.config.frequency_length // self.config.audio_patch_size[1]
+            audio_mask_noise, audio_len_keep = generate_audio_mask_noise(
+                audio_embedding_output,
+                audio_mask=audio_mask,
+                mask_ratio=self.config.audio_mask_ratio,
+                mask_type=self.config.audio_mask_type,
+                freq_len=num_freq_patches,
+            )
+            audio_embedding_output, audio_mask, audio_label_masks, audio_ids_restore = random_masking(
+                audio_embedding_output,
+                audio_mask_noise,
+                audio_len_keep,
+                attention_masks=audio_mask,
+            )
+
+        # Prepare for encoder inputs and attention masks
+        batch_size = pixel_values.size(0)
+        embedding_output = torch.cat(
+            [self.cls_embedding.repeat(batch_size, 1, 1), pixel_embedding_output, audio_embedding_output], 1
+        )
+        masked_pixel_len = pixel_embedding_output.size(1)
+
+        attention_mask = None
+        if pixel_mask is not None and audio_mask is not None:
+            attention_mask = torch.cat([pixel_mask[:, :1], pixel_mask, audio_mask], 1)
+
+        input_shape = embedding_output.size()
+        extended_attention_mask = None
+        if attention_mask is not None:
+            extended_attention_mask = self.get_extended_attention_mask(attention_mask, input_shape)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            attention_mask=extended_attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        if self.layernorm is not None:
+            sequence_output = self.layernorm(sequence_output)
+
+        pixel_sequence_output = sequence_output[:, 1 : 1 + masked_pixel_len]
+        audio_sequence_output = sequence_output[:, 1 + masked_pixel_len :]
+        if not return_dict:
+            return (
+                sequence_output,
+                pixel_sequence_output,
+                audio_sequence_output,
+                pixel_label_masks,
+                audio_label_masks,
+                pixel_ids_restore,
+                audio_ids_restore,
+            ) + encoder_outputs[1:]
+
+        return TvltModelOutput(
+            last_hidden_state=sequence_output,
+            last_pixel_hidden_state=pixel_sequence_output,
+            last_audio_hidden_state=audio_sequence_output,
+            pixel_label_masks=pixel_label_masks,
+            audio_label_masks=audio_label_masks,
+            pixel_ids_restore=pixel_ids_restore,
+            audio_ids_restore=audio_ids_restore,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class TvltDecoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        decoder_config = deepcopy(config)
+        decoder_config.hidden_size = config.decoder_hidden_size
+        decoder_config.num_hidden_layers = config.decoder_num_hidden_layers
+        decoder_config.num_attention_heads = config.decoder_num_attention_heads
+        decoder_config.intermediate_size = config.decoder_intermediate_size
+        self.decoder_layers = nn.ModuleList(
+            [TvltLayer(decoder_config) for _ in range(config.decoder_num_hidden_layers)]
+        )
+
+        self.layernorm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
+
+        self.gradient_checkpointing = False
+        self.config = config
+
+    def forward(
+        self,
+        hidden_states,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        # apply Transformer layers (blocks)
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        for i, layer_module in enumerate(self.decoder_layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = layer_module(hidden_states, output_attentions=output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # predictor projection
+        logits = self.layernorm(hidden_states)
+
+        if not return_dict:
+            return tuple(v for v in [logits, all_hidden_states, all_self_attentions] if v is not None)
+        return TvltDecoderOutput(logits=logits, hidden_states=all_hidden_states, attentions=all_self_attentions)
+
+
+@add_start_docstrings(
+    "The TVLT Model transformer with the decoder on top for self-supervised pre-training.",
+    TVLT_START_DOCSTRING,
+)
+class TvltForPreTraining(TvltPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.task_matching = config.task_matching
+        self.task_mae = config.task_mae
+        if not (self.task_matching or self.task_mae):
+            raise ValueError("Must set at least one of matching task and MAE task to true")
+
+        self.tvlt = TvltModel(config)
+
+        if self.task_matching:
+            self.matching_head = TvltMatchingHead(config)
+
+        if self.task_mae:
+            self.encoder_to_decoder = nn.Linear(config.hidden_size, config.decoder_hidden_size, bias=True)
+
+            self.pixel_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
+            self.audio_mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
+
+            self.decoder = TvltDecoder(config)
+
+            decoder_hidden_size = config.decoder_hidden_size
+
+            num_frames = config.num_frames
+            num_patches_per_image = self.tvlt.pixel_embeddings.num_patches_per_image
+            self.decoder_pixel_pos_embed = nn.Parameter(torch.zeros(1, num_patches_per_image, decoder_hidden_size))
+            self.decoder_temporal_embed = nn.Parameter(torch.zeros(1, config.num_frames, decoder_hidden_size))
+            self.decoder_pixel_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
+
+            num_audio_patches = self.tvlt.audio_embeddings.num_patches
+            num_freq_patches = config.frequency_length // config.audio_patch_size[1]
+            self.decoder_audio_pos_embed = nn.Parameter(
+                torch.zeros(1, num_audio_patches // num_freq_patches, decoder_hidden_size)
+            )
+            self.decoder_freq_embed = nn.Parameter(torch.zeros(1, num_freq_patches, decoder_hidden_size))
+            self.decoder_audio_type_embed = nn.Parameter(torch.zeros(1, 1, decoder_hidden_size))
+
+            pixel_mae_output_dim = self.config.image_patch_size[0] ** 2 * self.config.num_image_channels
+            self.pixel_mae_head = TvltMAEHead(config, pixel_mae_output_dim)
+            audio_mae_output_dim = (
+                self.config.audio_patch_size[0] * self.config.audio_patch_size[1] * self.config.num_audio_channels
+            )
+            self.audio_mae_head = TvltMAEHead(config, audio_mae_output_dim)
+
+            self.num_frames = num_frames
+            self.num_patches_per_image = num_patches_per_image
+            self.num_freq_patches = num_freq_patches
+            self.image_patch_size = config.image_patch_size
+            self.audio_patch_size = config.audio_patch_size
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def patchify_pixel(self, pixel_values):
+        """
+        pixel_values: [batch_size, num_frames, 3, height, width]
+        """
+        batch_size, num_frames, num_channels, height, width = pixel_values.shape
+        num_patches_height = pixel_values.shape[3] // self.image_patch_size[0]
+        num_patches_width = pixel_values.shape[4] // self.image_patch_size[1]
+        patchified_pixel_values = pixel_values.reshape(
+            shape=(
+                batch_size,
+                num_frames,
+                num_channels,
+                num_patches_height,
+                self.image_patch_size[0],
+                num_patches_width,
+                self.image_patch_size[1],
+            )
+        )
+        patchified_pixel_values = torch.einsum("ntchpwq->nthwpqc", patchified_pixel_values)
+        patchified_pixel_values = patchified_pixel_values.reshape(
+            shape=(
+                batch_size,
+                num_patches_height * num_patches_width * num_frames,
+                self.image_patch_size[0] * self.image_patch_size[1] * num_channels,
+            )
+        )
+        return patchified_pixel_values
+
+    def patchify_audio(self, audio_values):
+        """
+        audio_values: [batch_size, 1, height, width]
+        """
+        batch_size, num_channels, height, width = audio_values.shape
+        num_patches_height = height // self.audio_patch_size[0]
+        num_patches_width = width // self.audio_patch_size[1]
+        patchified_audio_values = audio_values.reshape(
+            shape=(
+                batch_size,
+                num_channels,
+                num_patches_height,
+                self.audio_patch_size[0],
+                num_patches_width,
+                self.audio_patch_size[1],
+            )
+        )
+        patchified_audio_values = torch.einsum("nchpwq->nhwpqc", patchified_audio_values)
+        patchified_audio_values = patchified_audio_values.reshape(
+            shape=(
+                batch_size,
+                num_patches_height * num_patches_width,
+                self.audio_patch_size[0] * self.audio_patch_size[1] * num_channels,
+            )
+        )
+        return patchified_audio_values
+
+    def pixel_mae_loss(self, pixel_values, pixel_predictions, mask):
+        patchified_pixel_values = self.patchify_pixel(pixel_values)
+        loss = (pixel_predictions - patchified_pixel_values) ** 2
+        loss = loss.mean(dim=-1)  # [batch_size, pixel_pixel_length], mean loss per patch
+        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
+        return loss
+
+    def audio_mae_loss(self, audio_values, audio_predictions, mask):
+        patchified_audio_values = self.patchify_audio(audio_values)
+        loss = (audio_predictions - patchified_audio_values) ** 2
+        loss = loss.mean(dim=-1)  # [batch_size, audio_pixel_length], mean loss per patch
+        loss = (loss * mask).sum() / mask.sum()  # mean loss on removed patches
+        return loss
+
+    def concatenate_mask(self, mask_token, sequence, ids_restore):
+        batch_size, seq_length, dim = sequence.shape
+        mask_tokens = mask_token.repeat(batch_size, ids_restore.shape[1] - seq_length, 1)
+        padded_sequence = torch.cat([sequence, mask_tokens], dim=1)
+        padded_sequence = torch.gather(
+            padded_sequence, dim=1, index=ids_restore.unsqueeze(-1).repeat(1, 1, dim)
+        )  # unshuffle
+        return padded_sequence
+
+    @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TvltForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        audio_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.FloatTensor] = None,
+        audio_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        pixel_values_mixed: Optional[torch.FloatTensor] = None,
+        pixel_mask_mixed: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], TvltForPreTrainingOutput]:
+        r"""
+        pixel_values_mixed (`torch.FloatTensor` of shape `(batch_size, num_frames, num_channels, height, width)`):
+            Pixel values that mix positive and negative samples in Tvlt vision-audio matching. Audio values can be
+            obtained using [`TvltProcessor`]. See [`TvltProcessor.__call__`] for details.
+
+        pixel_mask_mixed (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel masks of pixel_values_mixed. Pixel values mixed can be obtained using [`TvltProcessor`]. See
+            [`TvltProcessor.__call__`] for details.
+
+        labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
+            Labels for computing the vision audio matching loss. Indices should be in `[0, 1]`. num_labels has to be 1.
+
+        Return:
+
+        Examples:
+
+        ```python
+        >>> from transformers import TvltProcessor, TvltForPreTraining
+        >>> import numpy as np
+        >>> import torch
+
+        >>> num_frames = 8
+        >>> images = list(np.random.randn(num_frames, 3, 224, 224))
+        >>> images_mixed = list(np.random.randn(num_frames, 3, 224, 224))
+        >>> audio = list(np.random.randn(10000))
+        >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
+        >>> model = TvltForPreTraining.from_pretrained("ZinengTang/tvlt-base")
+        >>> input_dict = processor(
+        ...     images, audio, images_mixed, sampling_rate=44100, mask_pixel=True, mask_audio=True, return_tensors="pt"
+        ... )
+
+        >>> outputs = model(**input_dict)
+        >>> loss = outputs.loss
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        total_loss = 0.0
+
+        if self.task_matching:
+            if labels is None:
+                raise ValueError("Matching task requires labels")
+            if pixel_values_mixed is None:
+                raise ValueError("Matching task requires pixel_values_mixed")
+
+            outputs = self.tvlt(
+                pixel_values_mixed,
+                audio_values,
+                pixel_mask=pixel_mask_mixed,
+                audio_mask=audio_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+            sequence_output = outputs[0]
+            matching_logits = self.matching_head(sequence_output)
+
+            loss_fct = BCEWithLogitsLoss()
+            loss = loss_fct(matching_logits.view(-1), labels.view(-1))
+            total_loss += loss
+
+        pixel_logits = None
+        audio_logits = None
+        if self.task_mae and self.training:
+            outputs = self.tvlt(
+                pixel_values,
+                audio_values,
+                pixel_mask=pixel_mask,
+                audio_mask=audio_mask,
+                mask_pixel=True,
+                mask_audio=True,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+            pixel_sequence_output = outputs.last_pixel_hidden_state if return_dict else outputs[1]
+            audio_sequence_output = outputs.last_audio_hidden_state if return_dict else outputs[2]
+            pixel_label_masks = outputs.pixel_label_masks if return_dict else outputs[3]
+            audio_label_masks = outputs.audio_label_masks if return_dict else outputs[4]
+            pixel_ids_restore = outputs.pixel_ids_restore if return_dict else outputs[5]
+            audio_ids_restore = outputs.audio_ids_restore if return_dict else outputs[6]
+
+            pixel_decoder_input = self.encoder_to_decoder(
+                pixel_sequence_output
+            )  # [batch_size, num_masked_pixel_patches, decoder_hidden_size]
+            audio_decoder_input = self.encoder_to_decoder(
+                audio_sequence_output
+            )  # [batch_size, num_masked_audio_patches, decoder_hidden_size]
+            num_frames = pixel_values.size(1)
+            pixel_decoder_input = self.concatenate_mask(self.pixel_mask_token, pixel_decoder_input, pixel_ids_restore)
+            pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_pos_embed.repeat(1, num_frames, 1)
+            pixel_decoder_input = pixel_decoder_input + torch.repeat_interleave(
+                self.decoder_temporal_embed[:, :num_frames], self.num_patches_per_image, dim=1
+            )
+            pixel_decoder_input = pixel_decoder_input + self.decoder_pixel_type_embed
+            pixel_decoder_outputs = self.decoder(pixel_decoder_input)
+            pixel_logits = self.pixel_mae_head(pixel_decoder_outputs.logits)
+
+            audio_decoder_input = self.concatenate_mask(self.audio_mask_token, audio_decoder_input, audio_ids_restore)
+            num_time_patches = audio_decoder_input.size(1) // self.num_freq_patches
+            audio_decoder_input = audio_decoder_input + self.decoder_freq_embed.repeat(1, num_time_patches, 1)
+            audio_decoder_input = audio_decoder_input + torch.repeat_interleave(
+                self.decoder_audio_pos_embed[:, :num_time_patches], self.num_freq_patches, dim=1
+            )
+            audio_decoder_input = audio_decoder_input + self.decoder_audio_type_embed
+            audio_decoder_outputs = self.decoder(audio_decoder_input)
+            audio_logits = self.audio_mae_head(audio_decoder_outputs.logits)
+
+            loss = self.pixel_mae_loss(pixel_values, pixel_logits, pixel_label_masks) + self.audio_mae_loss(
+                audio_values, audio_logits, audio_label_masks
+            )
+            total_loss += loss
+
+        if not return_dict:
+            output = (matching_logits, pixel_logits, audio_logits) + outputs[7:]
+            return ((total_loss,) + output) if loss is not None else output
+
+        return TvltForPreTrainingOutput(
+            loss=total_loss,
+            matching_logits=matching_logits,
+            pixel_logits=pixel_logits,
+            audio_logits=audio_logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class TvltPooler(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+class TvltMatchingHead(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.pooler = TvltPooler(config)
+        self.fc = nn.Linear(config.hidden_size, 1)
+
+    def forward(self, hidden_states):
+        hidden_states = self.fc(self.pooler(hidden_states))
+        return hidden_states
+
+
+class TvltMAEHead(nn.Module):
+    def __init__(self, config, output_dim=None):
+        super().__init__()
+        self.config = config
+        self.decoder = nn.Linear(config.decoder_hidden_size, output_dim)
+
+    def forward(self, hidden_states):
+        hidden_states = self.decoder(hidden_states)
+        return hidden_states
+
+
+@add_start_docstrings(
+    """
+    Tvlt Model transformer with a classifier head on top (an MLP on top of the final hidden state of the [CLS] token)
+    for audiovisual classification tasks, e.g. CMU-MOSEI Sentiment Analysis and Audio to Video Retrieval.
+    """,
+    TVLT_START_DOCSTRING,
+)
+class TvltForAudioVisualClassification(TvltPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.tvlt = TvltModel(config)
+
+        # Classifier head
+        self.classifier = nn.Sequential(
+            nn.Linear(config.hidden_size, config.hidden_size * 2),
+            nn.LayerNorm(config.hidden_size * 2, eps=config.layer_norm_eps),
+            nn.GELU(),
+            nn.Linear(config.hidden_size * 2, config.num_labels),
+        )
+        self.config = config
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(TVLT_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=SequenceClassifierOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        audio_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.FloatTensor] = None,
+        audio_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple[torch.FloatTensor], SequenceClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, num_labels)`, *optional*):
+            Labels for computing the audiovisual loss. Indices should be in `[0, ..., num_classes-1]` where num_classes
+            refers to the number of classes in audiovisual tasks.
+
+        Return:
+
+        Examples:
+        ```python
+        >>> from transformers import TvltProcessor, TvltForAudioVisualClassification
+        >>> import numpy as np
+        >>> import torch
+
+        >>> num_frames = 8
+        >>> images = list(np.random.randn(num_frames, 3, 224, 224))
+        >>> audio = list(np.random.randn(10000))
+        >>> processor = TvltProcessor.from_pretrained("ZinengTang/tvlt-base")
+        >>> model = TvltForAudioVisualClassification.from_pretrained("ZinengTang/tvlt-base")
+        >>> input_dict = processor(images, audio, sampling_rate=44100, return_tensors="pt")
+
+        >>> outputs = model(**input_dict)
+        >>> loss = outputs.loss
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.tvlt(
+            pixel_values,
+            audio_values,
+            pixel_mask=pixel_mask,
+            audio_mask=audio_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = outputs[0][:, 0]
+        logits = self.classifier(sequence_output)  # rank value
+
+        loss = None
+        if labels is not None:
+            if self.config.loss_type == "regression":
+                loss_fct = MSELoss()
+                loss = loss_fct(logits, labels)
+            elif self.config.loss_type == "classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(logits, labels)
+
+        if not return_dict:
+            output = (logits,) + outputs[4:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["TvltModel", "TvltForPreTraining", "TvltForAudioVisualClassification", "TvltPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py
new file mode 100644
index 0000000000000000000000000000000000000000..1ecbac0dfa16f7c2cf70d07775bba2beeb456973
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/tvlt/processing_tvlt.py
@@ -0,0 +1,86 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for TVLT.
+"""
+
+from ....processing_utils import ProcessorMixin
+
+
+class TvltProcessor(ProcessorMixin):
+    r"""
+    Constructs a TVLT processor which wraps a TVLT image processor and TVLT feature extractor into a single processor.
+
+    [`TvltProcessor`] offers all the functionalities of [`TvltImageProcessor`] and [`TvltFeatureExtractor`]. See the
+    docstring of [`~TvltProcessor.__call__`] for more information.
+
+    Args:
+        image_processor (`TvltImageProcessor`):
+            An instance of [`TvltImageProcessor`]. The image processor is a required input.
+        feature_extractor (`TvltFeatureExtractor`):
+            An instance of [`TvltFeatureExtractor`]. The feature extractor is a required input.
+    """
+
+    attributes = ["image_processor", "feature_extractor"]
+    image_processor_class = "TvltImageProcessor"
+    feature_extractor_class = "TvltFeatureExtractor"
+
+    def __init__(self, image_processor, feature_extractor):
+        super().__init__(image_processor=image_processor, feature_extractor=feature_extractor)
+
+        self.image_processor = image_processor
+        self.feature_extractor = feature_extractor
+
+    def __call__(
+        self,
+        images=None,
+        audio=None,
+        images_mixed=None,
+        sampling_rate=None,
+        mask_audio=False,
+        mask_pixel=False,
+        *args,
+        **kwargs,
+    ):
+        """
+        Forwards the `images` argument to TvltImageProcessor's [`~TvltImageProcessor.preprocess`] and the `audio`
+        argument to TvltFeatureExtractor's [`~TvltFeatureExtractor.__call__`]. Please refer to the docstring of the
+        above two methods for more information.
+        """
+
+        if images is None and audio is None:
+            raise ValueError("You need to specify either an `images` or `audio` input to process.")
+
+        images_mixed_dict = None
+        if images is not None:
+            images_dict = self.image_processor(images, mask_pixel=mask_pixel, *args, **kwargs)
+        if images_mixed is not None:
+            images_mixed_dict = self.image_processor(images_mixed, is_mixed=True, *args, **kwargs)
+        if audio is not None:
+            audio_dict = self.feature_extractor(
+                audio, *args, sampling_rate=sampling_rate, mask_audio=mask_audio, **kwargs
+            )
+
+        output_dict = {}
+        if audio is not None:
+            output_dict.update(audio_dict)
+        if images is not None:
+            output_dict.update(images_dict)
+        if images_mixed_dict is not None:
+            output_dict.update(images_mixed_dict)
+        return output_dict
+
+
+__all__ = ["TvltProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9552c827365d3913539be3eafaf822146ada5829
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_van import *
+    from .modeling_van import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/configuration_van.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/configuration_van.py
new file mode 100644
index 0000000000000000000000000000000000000000..be94e45ec63d9516584c320c98e0b1157c092c2c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/configuration_van.py
@@ -0,0 +1,110 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""VAN model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class VanConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`VanModel`]. It is used to instantiate a VAN model
+    according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the VAN
+    [Visual-Attention-Network/van-base](https://huggingface.co/Visual-Attention-Network/van-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        patch_sizes (`list[int]`, *optional*, defaults to `[7, 3, 3, 3]`):
+            Patch size to use in each stage's embedding layer.
+        strides (`list[int]`, *optional*, defaults to `[4, 2, 2, 2]`):
+            Stride size to use in each stage's embedding layer to downsample the input.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[64, 128, 320, 512]`):
+            Dimensionality (hidden size) at each stage.
+        depths (`list[int]`, *optional*, defaults to `[3, 3, 12, 3]`):
+            Depth (number of layers) for each stage.
+        mlp_ratios (`list[int]`, *optional*, defaults to `[8, 8, 4, 4]`):
+            The expansion ratio for mlp layer at each stage.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in each layer. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        layer_scale_init_value (`float`, *optional*, defaults to 0.01):
+            The initial value for layer scaling.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            The dropout probability for stochastic depth.
+        dropout_rate (`float`, *optional*, defaults to 0.0):
+            The dropout probability for dropout.
+
+    Example:
+    ```python
+    >>> from transformers import VanModel, VanConfig
+
+    >>> # Initializing a VAN van-base style configuration
+    >>> configuration = VanConfig()
+    >>> # Initializing a model from the van-base style configuration
+    >>> model = VanModel(configuration)
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "van"
+
+    def __init__(
+        self,
+        image_size=224,
+        num_channels=3,
+        patch_sizes=[7, 3, 3, 3],
+        strides=[4, 2, 2, 2],
+        hidden_sizes=[64, 128, 320, 512],
+        depths=[3, 3, 12, 3],
+        mlp_ratios=[8, 8, 4, 4],
+        hidden_act="gelu",
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        layer_scale_init_value=1e-2,
+        drop_path_rate=0.0,
+        dropout_rate=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.image_size = image_size
+        self.num_channels = num_channels
+        self.patch_sizes = patch_sizes
+        self.strides = strides
+        self.hidden_sizes = hidden_sizes
+        self.depths = depths
+        self.mlp_ratios = mlp_ratios
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.layer_scale_init_value = layer_scale_init_value
+        self.drop_path_rate = drop_path_rate
+        self.dropout_rate = dropout_rate
+
+
+__all__ = ["VanConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/modeling_van.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/modeling_van.py
new file mode 100644
index 0000000000000000000000000000000000000000..c0fc0bc1a63728ea54935775b085dd8041e46a45
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/van/modeling_van.py
@@ -0,0 +1,520 @@
+# coding=utf-8
+# Copyright 2022 BNRist (Tsinghua University), TKLNDST (Nankai University) and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Visual Attention Network (VAN) model."""
+
+import math
+from collections import OrderedDict
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_outputs import (
+    BaseModelOutputWithNoAttention,
+    BaseModelOutputWithPoolingAndNoAttention,
+    ImageClassifierOutputWithNoAttention,
+)
+from ....modeling_utils import PreTrainedModel
+from ....utils import add_code_sample_docstrings, add_start_docstrings, add_start_docstrings_to_model_forward, logging
+from .configuration_van import VanConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "VanConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "Visual-Attention-Network/van-base"
+_EXPECTED_OUTPUT_SHAPE = [1, 512, 7, 7]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "Visual-Attention-Network/van-base"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+class VanDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class VanOverlappingPatchEmbedder(nn.Module):
+    """
+    Downsamples the input using a patchify operation with a `stride` of 4 by default making adjacent windows overlap by
+    half of the area. From [PVTv2: Improved Baselines with Pyramid Vision
+    Transformer](https://huggingface.co/papers/2106.13797).
+    """
+
+    def __init__(self, in_channels: int, hidden_size: int, patch_size: int = 7, stride: int = 4):
+        super().__init__()
+        self.convolution = nn.Conv2d(
+            in_channels, hidden_size, kernel_size=patch_size, stride=stride, padding=patch_size // 2
+        )
+        self.normalization = nn.BatchNorm2d(hidden_size)
+
+    def forward(self, input: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.convolution(input)
+        hidden_state = self.normalization(hidden_state)
+        return hidden_state
+
+
+class VanMlpLayer(nn.Module):
+    """
+    MLP with depth-wise convolution, from [PVTv2: Improved Baselines with Pyramid Vision
+    Transformer](https://huggingface.co/papers/2106.13797).
+    """
+
+    def __init__(
+        self,
+        in_channels: int,
+        hidden_size: int,
+        out_channels: int,
+        hidden_act: str = "gelu",
+        dropout_rate: float = 0.5,
+    ):
+        super().__init__()
+        self.in_dense = nn.Conv2d(in_channels, hidden_size, kernel_size=1)
+        self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=3, padding=1, groups=hidden_size)
+        self.activation = ACT2FN[hidden_act]
+        self.dropout1 = nn.Dropout(dropout_rate)
+        self.out_dense = nn.Conv2d(hidden_size, out_channels, kernel_size=1)
+        self.dropout2 = nn.Dropout(dropout_rate)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.in_dense(hidden_state)
+        hidden_state = self.depth_wise(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.dropout1(hidden_state)
+        hidden_state = self.out_dense(hidden_state)
+        hidden_state = self.dropout2(hidden_state)
+        return hidden_state
+
+
+class VanLargeKernelAttention(nn.Module):
+    """
+    Basic Large Kernel Attention (LKA).
+    """
+
+    def __init__(self, hidden_size: int):
+        super().__init__()
+        self.depth_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=5, padding=2, groups=hidden_size)
+        self.depth_wise_dilated = nn.Conv2d(
+            hidden_size, hidden_size, kernel_size=7, dilation=3, padding=9, groups=hidden_size
+        )
+        self.point_wise = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.depth_wise(hidden_state)
+        hidden_state = self.depth_wise_dilated(hidden_state)
+        hidden_state = self.point_wise(hidden_state)
+        return hidden_state
+
+
+class VanLargeKernelAttentionLayer(nn.Module):
+    """
+    Computes attention using Large Kernel Attention (LKA) and attends the input.
+    """
+
+    def __init__(self, hidden_size: int):
+        super().__init__()
+        self.attention = VanLargeKernelAttention(hidden_size)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        attention = self.attention(hidden_state)
+        attended = hidden_state * attention
+        return attended
+
+
+class VanSpatialAttentionLayer(nn.Module):
+    """
+    Van spatial attention layer composed by projection (via conv) -> act -> Large Kernel Attention (LKA) attention ->
+    projection (via conv) + residual connection.
+    """
+
+    def __init__(self, hidden_size: int, hidden_act: str = "gelu"):
+        super().__init__()
+        self.pre_projection = nn.Sequential(
+            OrderedDict(
+                [
+                    ("conv", nn.Conv2d(hidden_size, hidden_size, kernel_size=1)),
+                    ("act", ACT2FN[hidden_act]),
+                ]
+            )
+        )
+        self.attention_layer = VanLargeKernelAttentionLayer(hidden_size)
+        self.post_projection = nn.Conv2d(hidden_size, hidden_size, kernel_size=1)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        residual = hidden_state
+        hidden_state = self.pre_projection(hidden_state)
+        hidden_state = self.attention_layer(hidden_state)
+        hidden_state = self.post_projection(hidden_state)
+        hidden_state = hidden_state + residual
+        return hidden_state
+
+
+class VanLayerScaling(nn.Module):
+    """
+    Scales the inputs by a learnable parameter initialized by `initial_value`.
+    """
+
+    def __init__(self, hidden_size: int, initial_value: float = 1e-2):
+        super().__init__()
+        self.weight = nn.Parameter(initial_value * torch.ones(hidden_size), requires_grad=True)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        # unsqueezing for broadcasting
+        hidden_state = self.weight.unsqueeze(-1).unsqueeze(-1) * hidden_state
+        return hidden_state
+
+
+class VanLayer(nn.Module):
+    """
+    Van layer composed by normalization layers, large kernel attention (LKA) and a multi layer perceptron (MLP).
+    """
+
+    def __init__(
+        self,
+        config: VanConfig,
+        hidden_size: int,
+        mlp_ratio: int = 4,
+        drop_path_rate: float = 0.5,
+    ):
+        super().__init__()
+        self.drop_path = VanDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.pre_normomalization = nn.BatchNorm2d(hidden_size)
+        self.attention = VanSpatialAttentionLayer(hidden_size, config.hidden_act)
+        self.attention_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
+        self.post_normalization = nn.BatchNorm2d(hidden_size)
+        self.mlp = VanMlpLayer(
+            hidden_size, hidden_size * mlp_ratio, hidden_size, config.hidden_act, config.dropout_rate
+        )
+        self.mlp_scaling = VanLayerScaling(hidden_size, config.layer_scale_init_value)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        residual = hidden_state
+        # attention
+        hidden_state = self.pre_normomalization(hidden_state)
+        hidden_state = self.attention(hidden_state)
+        hidden_state = self.attention_scaling(hidden_state)
+        hidden_state = self.drop_path(hidden_state)
+        # residual connection
+        hidden_state = residual + hidden_state
+        residual = hidden_state
+        # mlp
+        hidden_state = self.post_normalization(hidden_state)
+        hidden_state = self.mlp(hidden_state)
+        hidden_state = self.mlp_scaling(hidden_state)
+        hidden_state = self.drop_path(hidden_state)
+        # residual connection
+        hidden_state = residual + hidden_state
+        return hidden_state
+
+
+class VanStage(nn.Module):
+    """
+    VanStage, consisting of multiple layers.
+    """
+
+    def __init__(
+        self,
+        config: VanConfig,
+        in_channels: int,
+        hidden_size: int,
+        patch_size: int,
+        stride: int,
+        depth: int,
+        mlp_ratio: int = 4,
+        drop_path_rate: float = 0.0,
+    ):
+        super().__init__()
+        self.embeddings = VanOverlappingPatchEmbedder(in_channels, hidden_size, patch_size, stride)
+        self.layers = nn.Sequential(
+            *[
+                VanLayer(
+                    config,
+                    hidden_size,
+                    mlp_ratio=mlp_ratio,
+                    drop_path_rate=drop_path_rate,
+                )
+                for _ in range(depth)
+            ]
+        )
+        self.normalization = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.embeddings(hidden_state)
+        hidden_state = self.layers(hidden_state)
+        # rearrange b c h w -> b (h w) c
+        batch_size, hidden_size, height, width = hidden_state.shape
+        hidden_state = hidden_state.flatten(2).transpose(1, 2)
+        hidden_state = self.normalization(hidden_state)
+        # rearrange  b (h w) c- > b c h w
+        hidden_state = hidden_state.view(batch_size, height, width, hidden_size).permute(0, 3, 1, 2)
+        return hidden_state
+
+
+class VanEncoder(nn.Module):
+    """
+    VanEncoder, consisting of multiple stages.
+    """
+
+    def __init__(self, config: VanConfig):
+        super().__init__()
+        self.stages = nn.ModuleList([])
+        patch_sizes = config.patch_sizes
+        strides = config.strides
+        hidden_sizes = config.hidden_sizes
+        depths = config.depths
+        mlp_ratios = config.mlp_ratios
+        drop_path_rates = [
+            x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")
+        ]
+
+        for num_stage, (patch_size, stride, hidden_size, depth, mlp_expansion, drop_path_rate) in enumerate(
+            zip(patch_sizes, strides, hidden_sizes, depths, mlp_ratios, drop_path_rates)
+        ):
+            is_first_stage = num_stage == 0
+            in_channels = hidden_sizes[num_stage - 1]
+            if is_first_stage:
+                in_channels = config.num_channels
+            self.stages.append(
+                VanStage(
+                    config,
+                    in_channels,
+                    hidden_size,
+                    patch_size=patch_size,
+                    stride=stride,
+                    depth=depth,
+                    mlp_ratio=mlp_expansion,
+                    drop_path_rate=drop_path_rate,
+                )
+            )
+
+    def forward(
+        self,
+        hidden_state: torch.Tensor,
+        output_hidden_states: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple, BaseModelOutputWithNoAttention]:
+        all_hidden_states = () if output_hidden_states else None
+
+        for _, stage_module in enumerate(self.stages):
+            hidden_state = stage_module(hidden_state)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_state,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, all_hidden_states] if v is not None)
+
+        return BaseModelOutputWithNoAttention(last_hidden_state=hidden_state, hidden_states=all_hidden_states)
+
+
+class VanPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: VanConfig
+    base_model_prefix = "van"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            nn.init.trunc_normal_(module.weight, std=self.config.initializer_range)
+            if isinstance(module, nn.Linear) and module.bias is not None:
+                nn.init.constant_(module.bias, 0)
+        elif isinstance(module, nn.LayerNorm):
+            nn.init.constant_(module.bias, 0)
+            nn.init.constant_(module.weight, 1.0)
+        elif isinstance(module, nn.Conv2d):
+            fan_out = module.kernel_size[0] * module.kernel_size[1] * module.out_channels
+            fan_out //= module.groups
+            module.weight.data.normal_(0, math.sqrt(2.0 / fan_out))
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+VAN_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`VanConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VAN_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`ConvNextImageProcessor.__call__`] for details.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all stages. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare VAN model outputting raw features without any specific head on top. Note, VAN does not have an embedding"
+    " layer.",
+    VAN_START_DOCSTRING,
+)
+class VanModel(VanPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.encoder = VanEncoder(config)
+        # final layernorm layer
+        self.layernorm = nn.LayerNorm(config.hidden_sizes[-1], eps=config.layer_norm_eps)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPoolingAndNoAttention,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor],
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_outputs = self.encoder(
+            pixel_values,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = encoder_outputs[0]
+        # global average pooling, n c w h -> n c
+        pooled_output = last_hidden_state.mean(dim=[-2, -1])
+
+        if not return_dict:
+            return (last_hidden_state, pooled_output) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@add_start_docstrings(
+    """
+    VAN Model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """,
+    VAN_START_DOCSTRING,
+)
+class VanForImageClassification(VanPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.van = VanModel(config)
+        # Classifier head
+        self.classifier = (
+            nn.Linear(config.hidden_sizes[-1], config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(VAN_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=ImageClassifierOutputWithNoAttention,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.van(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(loss=loss, logits=logits, hidden_states=outputs.hidden_states)
+
+
+__all__ = ["VanForImageClassification", "VanModel", "VanPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..f5bd93aa4dabea9b2adbc54eeeb3664de589f43a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2022 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_vit_hybrid import *
+    from .image_processing_vit_hybrid import *
+    from .modeling_vit_hybrid import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..e96c6c4a1b58cd40e86dfac71dfcb9c0eb19f57c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/configuration_vit_hybrid.py
@@ -0,0 +1,180 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""ViT Hybrid model configuration"""
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+from ...auto.configuration_auto import CONFIG_MAPPING
+from ...bit import BitConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class ViTHybridConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ViTHybridModel`]. It is used to instantiate a ViT
+    Hybrid model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the ViT Hybrid
+    [google/vit-hybrid-base-bit-384](https://huggingface.co/google/vit-hybrid-base-bit-384) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        backbone_config (`Union[dict[str, Any], PretrainedConfig]`, *optional*):
+            The configuration of the backbone in a dictionary or the config object of the backbone.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 1):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        backbone_featmap_shape (`list[int]`, *optional*, defaults to `[1, 1024, 24, 24]`):
+            Used only for the `hybrid` embedding type. The shape of the feature maps of the backbone.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+
+    Example:
+
+    ```python
+    >>> from transformers import ViTHybridConfig, ViTHybridModel
+
+    >>> # Initializing a ViT Hybrid vit-hybrid-base-bit-384 style configuration
+    >>> configuration = ViTHybridConfig()
+
+    >>> # Initializing a model (with random weights) from the vit-hybrid-base-bit-384 style configuration
+    >>> model = ViTHybridModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "vit-hybrid"
+
+    def __init__(
+        self,
+        backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
+        use_timm_backbone=False,
+        backbone_kwargs=None,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-12,
+        image_size=224,
+        patch_size=1,
+        num_channels=3,
+        backbone_featmap_shape=[1, 1024, 24, 24],
+        qkv_bias=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if use_pretrained_backbone:
+            raise ValueError("Pretrained backbones are not supported yet.")
+
+        if backbone_config is not None and backbone is not None:
+            raise ValueError("You can't specify both `backbone` and `backbone_config`.")
+
+        if backbone_config is None and backbone is None:
+            logger.info("`backbone_config` is `None`. Initializing the config with a `BiT` backbone.")
+            backbone_config = {
+                "global_padding": "same",
+                "layer_type": "bottleneck",
+                "depths": [3, 4, 9],
+                "out_features": ["stage3"],
+                "embedding_dynamic_padding": True,
+            }
+
+        if backbone_kwargs is not None and backbone_kwargs and backbone_config is not None:
+            raise ValueError("You can't specify both `backbone_kwargs` and `backbone_config`.")
+
+        if isinstance(backbone_config, dict):
+            if "model_type" in backbone_config:
+                backbone_config_class = CONFIG_MAPPING[backbone_config["model_type"]]
+            else:
+                logger.info(
+                    "`model_type` is not found in `backbone_config`. Use `Bit` as the backbone configuration class."
+                )
+                backbone_config_class = BitConfig
+            backbone_config = backbone_config_class(**backbone_config)
+
+        self.backbone_featmap_shape = backbone_featmap_shape
+        self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.backbone_kwargs = backbone_kwargs
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+
+    @property
+    def sub_configs(self):
+        return (
+            {"backbone_config": type(self.backbone_config)}
+            if getattr(self, "backbone_config", None) is not None
+            else {}
+        )
+
+
+__all__ = ["ViTHybridConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..92d518363b2c34b5b30fd04dc0dbe982c8551e3d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/image_processing_vit_hybrid.py
@@ -0,0 +1,341 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for ViT hybrid."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ....image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ....image_transforms import (
+    convert_to_rgb,
+    get_resize_output_image_size,
+    resize,
+    to_channel_dimension_format,
+)
+from ....image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ....utils import TensorType, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+
+
+class ViTHybridImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a ViT Hybrid image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`dict[str, int]` *optional*, defaults to 224):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize:
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 224}
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+        self._valid_processor_keys = [
+            "images",
+            "do_resize",
+            "size",
+            "resample",
+            "do_center_crop",
+            "crop_size",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "do_convert_rgb",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resiizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            default_to_square=default_to_square,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[int] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+                the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: defaults to the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size, param_name="size", default_to_square=False)
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        images = make_flat_list_of_images(images)
+
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # PIL RGBA images are converted to RGB
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        all_images = []
+        for image in images:
+            if do_resize:
+                image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+            if do_center_crop:
+                image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
+
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            all_images.append(image)
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            for image in all_images
+        ]
+
+        data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+
+__all__ = ["ViTHybridImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
new file mode 100644
index 0000000000000000000000000000000000000000..86b1594a20c92b8f2f528714d4ea9f36e93b8e4d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/vit_hybrid/modeling_vit_hybrid.py
@@ -0,0 +1,740 @@
+# coding=utf-8
+# Copyright 2022 Google AI, Ross Wightman, The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch ViT Hybrid model."""
+
+import collections.abc
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....activations import ACT2FN
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ....modeling_utils import PreTrainedModel
+from ....pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ....utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    torch_int,
+)
+from ....utils.backbone_utils import load_backbone
+from .configuration_vit_hybrid import ViTHybridConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "ViTHybridConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "google/vit-hybrid-base-bit-384"
+_EXPECTED_OUTPUT_SHAPE = [1, 197, 768]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "google/vit-hybrid-base-bit-384"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+class ViTHybridEmbeddings(nn.Module):
+    """
+    Construct the CLS token, position and patch embeddings. Optionally, also the mask token.
+    """
+
+    def __init__(self, config: ViTHybridConfig, use_mask_token: bool = False) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size)) if use_mask_token else None
+        self.patch_embeddings = ViTHybridPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    # Copied from transformers.models.vit.modeling_vit.ViTEmbeddings.interpolate_pos_encoding
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        class_pos_embed = self.position_embeddings[:, :1]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+        embeddings = self.patch_embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+
+        if bool_masked_pos is not None:
+            seq_length = embeddings.shape[1]
+            mask_tokens = self.mask_token.expand(batch_size, seq_length, -1)
+            # replace the masked visual tokens by mask_tokens
+            mask = bool_masked_pos.unsqueeze(-1).type_as(mask_tokens)
+            embeddings = embeddings * (1.0 - mask) + mask_tokens * mask
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embeddings
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class ViTHybridPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config, feature_size=None):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+
+        self.backbone = load_backbone(config)
+        if self.backbone.config.model_type != "bit":
+            raise ValueError(f"Backbone model type {self.backbone.model_type} is not supported.")
+        feature_dim = self.backbone.channels[-1]
+
+        if feature_size is None:
+            feature_map = config.backbone_featmap_shape
+
+            feature_size = feature_map[-2:]
+            feature_dim = feature_map[1]
+        else:
+            feature_size = (
+                feature_size if isinstance(feature_size, collections.abc.Iterable) else (feature_size, feature_size)
+            )
+            feature_dim = self.backbone.channels[-1]
+
+        self.grid_size = (feature_size[0] // patch_size[0], feature_size[1] // patch_size[1])
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+
+        self.projection = nn.Conv2d(feature_dim, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor, interpolate_pos_encoding: bool = False) -> torch.Tensor:
+        _, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        if not interpolate_pos_encoding:
+            if height != self.image_size[0] or width != self.image_size[1]:
+                raise ValueError(
+                    f"Input image size ({height}*{width}) doesn't match model"
+                    f" ({self.image_size[0]}*{self.image_size[1]})."
+                )
+
+        features = self.backbone(pixel_values).feature_maps[-1]
+        embeddings = self.projection(features).flatten(2).transpose(1, 2)
+
+        return embeddings
+
+
+class ViTHybridSelfAttention(nn.Module):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def transpose_for_scores(self, x: torch.Tensor) -> torch.Tensor:
+        new_x_shape = x.size()[:-1] + (self.num_attention_heads, self.attention_head_size)
+        x = x.view(new_x_shape)
+        return x.permute(0, 2, 1, 3)
+
+    def forward(
+        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        mixed_query_layer = self.query(hidden_states)
+
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class ViTHybridSdpaSelfAttention(ViTHybridSelfAttention):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__(config)
+        self.attention_probs_dropout_prob = config.attention_probs_dropout_prob
+
+    def forward(
+        self, hidden_states, head_mask: Optional[torch.Tensor] = None, output_attentions: bool = False
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        mixed_query_layer = self.query(hidden_states)
+
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+        query_layer = self.transpose_for_scores(mixed_query_layer)
+
+        context_layer = torch.nn.functional.scaled_dot_product_attention(
+            query_layer,
+            key_layer,
+            value_layer,
+            head_mask,
+            self.attention_probs_dropout_prob if self.training else 0.0,
+            is_causal=False,
+            scale=None,
+        )
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        return context_layer, None
+
+
+class ViTHybridSelfOutput(nn.Module):
+    """
+    The residual connection is defined in ViTHybridLayer instead of here (as is the case with other models), due to the
+    layernorm applied before each block.
+    """
+
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class ViTHybridAttention(nn.Module):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__()
+        self.attention = ViTHybridSelfAttention(config)
+        self.output = ViTHybridSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads: set[int]) -> None:
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        self_outputs = self.attention(hidden_states, head_mask, output_attentions)
+
+        attention_output = self.output(self_outputs[0], hidden_states)
+
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class ViTHybridSdpaAttention(ViTHybridAttention):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__(config)
+        self.attention = ViTHybridSdpaSelfAttention(config)
+
+
+class ViTHybridIntermediate(nn.Module):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+
+        return hidden_states
+
+
+class ViTHybridOutput(nn.Module):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        hidden_states = hidden_states + input_tensor
+
+        return hidden_states
+
+
+VIT_HYBRID_ATTENTION_CLASSES = {
+    "eager": ViTHybridAttention,
+    "sdpa": ViTHybridSdpaAttention,
+}
+
+
+class ViTHybridLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the timm implementation."""
+
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = VIT_HYBRID_ATTENTION_CLASSES[config._attn_implementation](config)
+        self.intermediate = ViTHybridIntermediate(config)
+        self.output = ViTHybridOutput(config)
+        self.layernorm_before = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.layernorm_after = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> Union[tuple[torch.Tensor, torch.Tensor], tuple[torch.Tensor]]:
+        self_attention_outputs = self.attention(
+            self.layernorm_before(hidden_states),  # in ViTHybrid, layernorm is applied before self-attention
+            head_mask,
+            output_attentions=output_attentions,
+        )
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        # first residual connection
+        # We assign to correct device for `accelerate`, check: https://github.com/huggingface/transformers/pull/20705/
+        hidden_states = attention_output + hidden_states.to(attention_output.device)
+
+        # in ViTHybrid, layernorm is also applied after self-attention
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.intermediate(layer_output)
+
+        # second residual connection is done here
+        layer_output = self.output(layer_output, hidden_states)
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+
+class ViTHybridEncoder(nn.Module):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([ViTHybridLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(hidden_states, layer_head_mask, output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class ViTHybridPreTrainedModel(PreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config: ViTHybridConfig
+    base_model_prefix = "vit"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["ViTHybridEmbeddings", "ViTHybridLayer"]
+    _supports_sdpa = True
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, ViTHybridEmbeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+            module.mask_token.data.zero_()
+
+
+VIT_START_DOCSTRING = r"""
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass. Use it
+    as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`ViTHybridConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+VIT_INPUTS_DOCSTRING = r"""
+    Args:
+        pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`ViTHybridImageProcessor.__call__`] for details.
+
+        head_mask (`torch.FloatTensor` of shape `(num_heads,)` or `(num_layers, num_heads)`, *optional*):
+            Mask to nullify selected heads of the self-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+@add_start_docstrings(
+    "The bare ViT Hybrid Model transformer outputting raw hidden-states without any specific head on top.",
+    VIT_START_DOCSTRING,
+)
+class ViTHybridModel(ViTHybridPreTrainedModel):
+    def __init__(self, config: ViTHybridConfig, add_pooling_layer: bool = True, use_mask_token: bool = False):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = ViTHybridEmbeddings(config, use_mask_token=use_mask_token)
+        self.encoder = ViTHybridEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.pooler = ViTHybridPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> ViTHybridPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=BaseModelOutputWithPooling,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, num_patches)`, *optional*):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0).
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        # TODO: maybe have a cleaner way to cast the input (from `ImageProcessor` side?)
+        expected_dtype = self.embeddings.patch_embeddings.projection.weight.dtype
+        if pixel_values.dtype != expected_dtype:
+            pixel_values = pixel_values.to(expected_dtype)
+
+        embedding_output = self.embeddings(
+            pixel_values, bool_masked_pos=bool_masked_pos, interpolate_pos_encoding=interpolate_pos_encoding
+        )
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = self.pooler(sequence_output) if self.pooler is not None else None
+
+        if not return_dict:
+            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+            return head_outputs + encoder_outputs[1:]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+class ViTHybridPooler(nn.Module):
+    def __init__(self, config: ViTHybridConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.activation = nn.Tanh()
+
+    def forward(self, hidden_states):
+        # We "pool" the model by simply taking the hidden state corresponding
+        # to the first token.
+        first_token_tensor = hidden_states[:, 0]
+        pooled_output = self.dense(first_token_tensor)
+        pooled_output = self.activation(pooled_output)
+        return pooled_output
+
+
+@add_start_docstrings(
+    """
+    ViT Hybrid Model transformer with an image classification head on top (a linear layer on top of the final hidden
+    state of the [CLS] token) e.g. for ImageNet.
+    """,
+    VIT_START_DOCSTRING,
+)
+class ViTHybridForImageClassification(ViTHybridPreTrainedModel):
+    def __init__(self, config: ViTHybridConfig) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.vit = ViTHybridModel(config, add_pooling_layer=False)
+
+        # Classifier head
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @add_start_docstrings_to_model_forward(VIT_INPUTS_DOCSTRING)
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=ImageClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.vit(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        logits = self.classifier(sequence_output[:, 0, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["ViTHybridForImageClassification", "ViTHybridModel", "ViTHybridPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c13c67012fa157a94bae21734a1f59b26c78588d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2020 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ....utils import _LazyModule
+from ....utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_xlm_prophetnet import *
+    from .modeling_xlm_prophetnet import *
+    from .tokenization_xlm_prophetnet import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..59f42577c59f19d9d379eaaec15029eaed6967c3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/configuration_xlm_prophetnet.py
@@ -0,0 +1,181 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""XLM-ProphetNet model configuration"""
+
+from typing import Callable, Optional, Union
+
+from ....configuration_utils import PretrainedConfig
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class XLMProphetNetConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`XLMProphetNetModel`]. It is used to instantiate a
+    XLMProphetNet model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the XLMProphetNet
+    [microsoft/xprophetnet-large-wiki100-cased](https://huggingface.co/microsoft/xprophetnet-large-wiki100-cased)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for activations inside the fully connected layer.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        vocab_size (`int`, *optional*, defaults to 30522):
+            Vocabulary size of the ProphetNET model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`XLMProphetNetModel`].
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        num_encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        num_encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the `intermediate` (often named feed-forward) layer in decoder.
+        num_decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        num_decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        add_cross_attention (`bool`, *optional*, defaults to `True`):
+            Whether cross-attention layers should be added to the model.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether this is an encoder/decoder model.
+        pad_token_id (`int`, *optional*, defaults to 1)
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 0)
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2)
+            End of stream token id.
+        ngram (`int`, *optional*, defaults to 2)
+            Number of future tokens to predict. Set to 1 to be same as traditional Language model to predict next first
+            token.
+        num_buckets (`int`, *optional*, defaults to 32)
+            The number of buckets to use for each attention layer. This is for relative position calculation. See the
+            [T5 paper](see https://huggingface.co/papers/1910.10683) for more details.
+        relative_max_distance (`int`, *optional*, defaults to 128)
+            Relative distances greater than this number will be put into the last same bucket. This is for relative
+            position calculation. See the [T5 paper](see https://huggingface.co/papers/1910.10683) for more details.
+        disable_ngram_loss (`bool`, *optional*, defaults to `False`):
+            Whether be trained predicting only the next first token.
+        eps (`float`, *optional*, defaults to 0.0):
+            Controls the `epsilon` parameter value for label smoothing in the loss calculation. If set to 0, no label
+            smoothing is performed.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+    """
+
+    model_type = "xlm-prophetnet"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "num_attention_heads": "num_encoder_attention_heads",
+    }
+
+    def __init__(
+        self,
+        activation_dropout: Optional[float] = 0.1,
+        activation_function: Optional[Union[str, Callable]] = "gelu",
+        vocab_size: Optional[int] = 30522,
+        hidden_size: Optional[int] = 1024,
+        encoder_ffn_dim: Optional[int] = 4096,
+        num_encoder_layers: Optional[int] = 12,
+        num_encoder_attention_heads: Optional[int] = 16,
+        decoder_ffn_dim: Optional[int] = 4096,
+        num_decoder_layers: Optional[int] = 12,
+        num_decoder_attention_heads: Optional[int] = 16,
+        attention_dropout: Optional[float] = 0.1,
+        dropout: Optional[float] = 0.1,
+        max_position_embeddings: Optional[int] = 512,
+        init_std: Optional[float] = 0.02,
+        is_encoder_decoder: Optional[bool] = True,
+        add_cross_attention: Optional[bool] = True,
+        decoder_start_token_id: Optional[int] = 0,
+        ngram: Optional[int] = 2,
+        num_buckets: Optional[int] = 32,
+        relative_max_distance: Optional[int] = 128,
+        disable_ngram_loss: Optional[bool] = False,
+        eps: Optional[float] = 0.0,
+        use_cache: Optional[bool] = True,
+        pad_token_id: Optional[int] = 0,
+        bos_token_id: Optional[int] = 1,
+        eos_token_id: Optional[int] = 2,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.hidden_size = hidden_size
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.num_encoder_layers = num_encoder_layers
+        self.num_encoder_attention_heads = num_encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.num_decoder_layers = num_decoder_layers
+        self.num_decoder_attention_heads = num_decoder_attention_heads
+        self.max_position_embeddings = max_position_embeddings
+        self.init_std = init_std  # Normal(0, this parameter)
+        self.activation_function = activation_function
+
+        # parameters for xlmprophetnet
+        self.ngram = ngram
+        self.num_buckets = num_buckets
+        self.relative_max_distance = relative_max_distance
+        self.disable_ngram_loss = disable_ngram_loss
+        self.eps = eps
+
+        # 3 Types of Dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.dropout = dropout
+
+        self.use_cache = use_cache
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            add_cross_attention=add_cross_attention,
+            decoder_start_token_id=decoder_start_token_id,
+            **kwargs,
+        )
+
+    @property
+    def num_hidden_layers(self) -> int:
+        return self.num_encoder_layers + self.num_decoder_layers
+
+    @num_hidden_layers.setter
+    def num_hidden_layers(self, value):
+        raise NotImplementedError(
+            "This model does not support the setting of `num_hidden_layers`. Please set `num_encoder_layers` and"
+            " `num_decoder_layers`."
+        )
+
+
+__all__ = ["XLMProphetNetConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..36f6e6097bc3e55a153b991df9161b01c09c153a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/modeling_xlm_prophetnet.py
@@ -0,0 +1,2305 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch XLM-ProphetNet model."""
+
+import copy
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+from torch.nn import LayerNorm
+
+from ....activations import ACT2FN
+from ....cache_utils import Cache
+from ....modeling_layers import GradientCheckpointingLayer
+from ....modeling_outputs import BaseModelOutput
+from ....modeling_utils import PreTrainedModel
+from ....utils import (
+    ModelOutput,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from ....utils.deprecation import deprecate_kwarg
+from .configuration_xlm_prophetnet import XLMProphetNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+_CONFIG_FOR_DOC = "XLMProphetNetConfig"
+
+
+XLM_PROPHETNET_START_DOCSTRING = r"""
+    This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    Original ProphetNet code can be found [here](https://github.com/microsoft/ProphetNet). Checkpoints were converted
+    from original Fairseq checkpoints. For more information on the checkpoint conversion, please take a look at the
+    file `convert_prophetnet_original_pytorch_checkpoint_to_pytorch.py`.
+
+    This model is a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) sub-class. Use
+    it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`XLMProphetNetConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+XLM_PROPHETNET_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            XLMProphetNet uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+
+        decoder_attention_mask (`torch.BoolTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+def softmax(hidden_state, dim, onnx_trace=False):
+    if onnx_trace:
+        return nn.functional.softmax(hidden_state.float(), dim=dim)
+    else:
+        return nn.functional.softmax(hidden_state, dim=dim, dtype=torch.float32)
+
+
+def ngram_attention_bias(sequence_length, ngram, device, dtype):
+    """
+    This function computes the bias for the predict stream
+    """
+    left_block = (
+        torch.ones((ngram, sequence_length, sequence_length), device=device, dtype=dtype) * torch.finfo(dtype).min
+    )
+    right_block = left_block.detach().clone()
+    # create bias
+    for stream_idx in range(ngram):
+        right_block[stream_idx].fill_diagonal_(0, wrap=False)
+        left_block[stream_idx].triu_(-stream_idx + 1)
+
+    left_block[:, :, 0] = 0
+    return torch.cat([left_block, right_block], dim=2)
+
+
+def compute_relative_buckets(num_buckets, max_distance, relative_positions, is_bidirectional=False):
+    """
+    This function computes individual parts of the relative position buckets. For more detail, see paper.
+    """
+    inv_relative_positions = -relative_positions
+    rel_positions_bucket = 0
+
+    if is_bidirectional:
+        num_buckets = num_buckets // 2
+        rel_positions_bucket = (
+            rel_positions_bucket
+            + torch.lt(inv_relative_positions, torch.zeros_like(inv_relative_positions)).int() * num_buckets
+        )
+        inv_relative_positions = torch.abs(inv_relative_positions)
+    else:
+        inv_relative_positions = torch.max(inv_relative_positions, torch.zeros_like(inv_relative_positions))
+
+    max_exact = num_buckets // 2
+    is_small = torch.lt(inv_relative_positions, max_exact)
+    val_if_large = max_exact + torch.log(inv_relative_positions.float() / max_exact) / math.log(
+        max_distance / max_exact
+    ) * (num_buckets - max_exact)
+    val_if_large = torch.min(val_if_large, torch.ones_like(val_if_large) * (num_buckets - 1)).int()
+    rel_positions_bucket = rel_positions_bucket + torch.where(is_small, inv_relative_positions.int(), val_if_large)
+    return rel_positions_bucket
+
+
+def compute_all_stream_relative_buckets(num_buckets, max_distance, position_ids):
+    """
+    This function computes both main and predict relative position buckets. For more detail, see paper.
+    """
+    # main stream
+    main_stream_relative_positions = position_ids.unsqueeze(1).repeat(1, position_ids.size(-1), 1)
+    main_stream_relative_positions = main_stream_relative_positions - position_ids.unsqueeze(-1)
+
+    # predicting stream
+    predicting_stream_relative_positions = torch.cat((position_ids - 1, position_ids), dim=-1).unsqueeze(1)
+    predicting_stream_relative_positions = predicting_stream_relative_positions.repeat(1, position_ids.size(-1), 1)
+    predicting_stream_relative_positions = predicting_stream_relative_positions - position_ids.unsqueeze(-1)
+
+    # get both position buckets
+    main_relative_position_buckets = compute_relative_buckets(
+        num_buckets, max_distance, main_stream_relative_positions, is_bidirectional=False
+    )
+    predict_relative_position_buckets = compute_relative_buckets(
+        num_buckets, max_distance, predicting_stream_relative_positions, is_bidirectional=False
+    )
+    return main_relative_position_buckets, predict_relative_position_buckets
+
+
+@dataclass
+class XLMProphetNetSeq2SeqLMOutput(ModelOutput):
+    """
+    Base class for sequence-to-sequence language models outputs.
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
+            SoftMax).
+        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
+            SoftMax).
+        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+            outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+            weighted average in the self-attention heads.
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            encoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, encoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            encoder_sequence_length, encoder_sequence_length)`. Attentions weights of the encoder, after the attention
+            softmax, used to compute the weighted average in the self-attention heads.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    logits_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_ngram_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    decoder_ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+    @property
+    def decoder_cross_attentions(self):
+        warnings.warn(
+            "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
+            " instead.",
+            FutureWarning,
+        )
+        return self.cross_attentions
+
+
+@dataclass
+class XLMProphetNetSeq2SeqModelOutput(ModelOutput):
+    """
+    Base class for model encoder's outputs that also contains : pre-computed hidden states that can speed up sequential
+    decoding.
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
+            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size,ngram * decoder_sequence_length, config.vocab_size)`, *optional*):
+            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
+        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see `past_key_values` input) to speed up sequential decoding.
+        decoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        decoder_ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+            outputs.
+        decoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        decoder_ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+            weighted average in the
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            encoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the
+        encoder_last_hidden_state (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder of the model.
+        encoder_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, encoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the encoder at the output of each layer plus the initial embedding outputs.
+        encoder_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            encoder_sequence_length, encoder_sequence_length)`.
+
+            Attentions weights of the encoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+    """
+
+    last_hidden_state: torch.FloatTensor
+    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_ngram_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    decoder_ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+    @property
+    def decoder_cross_attentions(self):
+        warnings.warn(
+            "`decoder_cross_attentions` is deprecated and will be removed soon. Please use `cross_attentions`"
+            " instead.",
+            FutureWarning,
+        )
+        return self.cross_attentions
+
+
+@dataclass
+class XLMProphetNetDecoderModelOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        last_hidden_state (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, hidden_size)`):
+            Sequence of main stream hidden-states at the output of the last layer of the decoder of the model.
+
+            If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+            hidden_size)` is output.
+        last_hidden_state_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+            Sequence of predict stream hidden-states at the output of the last layer of the decoder of the model.
+        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+            outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+            weighted average in the
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            encoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the
+    """
+
+    last_hidden_state: torch.FloatTensor
+    last_hidden_state_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    hidden_states_ngram: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+class XLMProphetNetDecoderLMOutput(ModelOutput):
+    """
+    Base class for model's outputs that may also contain a past key/values (to speed up sequential decoding).
+
+    Args:
+        loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+            Language modeling loss.
+        logits (`torch.FloatTensor` of shape `(batch_size, decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the main stream language modeling head (scores for each vocabulary token before
+            SoftMax).
+        logits_ngram (`torch.FloatTensor` of shape `(batch_size, ngram * decoder_sequence_length, config.vocab_size)`):
+            Prediction scores of the predict stream language modeling head (scores for each vocabulary token before
+            SoftMax).
+        past_key_values (`list[torch.FloatTensor]`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+            List of `torch.FloatTensor` of length `config.n_layers`, with each tensor of shape `(2, batch_size,
+            num_attn_heads, decoder_sequence_length, embed_size_per_head)`).
+
+            Contains pre-computed hidden-states (key and values in the attention blocks) of the decoder that can be
+            used (see `past_key_values` input) to speed up sequential decoding.
+        hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of main stream of the decoder at the output of each layer plus the initial embedding outputs.
+        ngram_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+            Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape `(batch_size, ngram * decoder_sequence_length, hidden_size)`.
+
+            Hidden-states of the predict stream of the decoder at the output of each layer plus the initial embedding
+            outputs.
+        attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the decoder, after the attention softmax, used to compute the weighted average in the
+            self-attention heads.
+        ngram_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            decoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the predict stream of the decoder, after the attention softmax, used to compute the
+            weighted average in the
+        cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
+            Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_attn_heads,
+            encoder_sequence_length, decoder_sequence_length)`.
+
+            Attentions weights of the cross-attention layer of the decoder, after the attention softmax, used to
+            compute the weighted average in the
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    logits_ngram: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    hidden_states_ngram: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    ngram_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class XLMProphetNetPreTrainedModel(PreTrainedModel):
+    config: XLMProphetNetConfig
+    base_model_prefix = "prophetnet"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.init_std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+    def _shift_right(self, input_ids):
+        decoder_start_token_id = self.config.decoder_start_token_id
+        pad_token_id = self.config.pad_token_id
+
+        assert decoder_start_token_id is not None, (
+            "self.model.config.decoder_start_token_id has to be defined. In XLMProphetNet it is usually set to the"
+            " pad_token_id. See XLMProphetNet docs for more information"
+        )
+
+        # shift inputs to the right
+        shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+        shifted_input_ids[..., 1:] = input_ids[..., :-1].clone()
+        shifted_input_ids[..., 0] = decoder_start_token_id
+
+        assert pad_token_id is not None, "self.model.config.pad_token_id has to be defined."
+        # replace possible -100 values in labels by `pad_token_id`
+        shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+        assert torch.all(shifted_input_ids >= 0).item(), "Verify that `shifted_input_ids` has only positive values"
+
+        return shifted_input_ids
+
+
+class XLMProphetNetPositionalEmbeddings(nn.Embedding):
+    """
+    This module learns positional embeddings up to a fixed maximum size. Padding ids are ignored by either offsetting
+    based on padding_idx or by setting padding_idx to None and ensuring that the appropriate position ids are passed to
+    the forward function.
+    """
+
+    def __init__(self, config: XLMProphetNetConfig) -> None:
+        self.max_length = config.max_position_embeddings
+        super().__init__(config.max_position_embeddings, config.hidden_size, config.pad_token_id)
+
+    def forward(self, inputs_shape, device, attention_mask=None, past_key_values=None, position_ids=None):
+        assert (position_ids is None) or (self.padding_idx is None), (
+            "If position_ids is pre-computed then padding_idx should not be set."
+        )
+
+        if position_ids is None:
+            if past_key_values is not None:
+                # position_ids is the same for every token when decoding a single step
+                # Without the int() cast, it doesn't work in some cases when exporting to ONNX
+                prev_num_input_ids = past_key_values.get_seq_length()
+                num_input_ids = inputs_shape[1] + prev_num_input_ids
+                position_ids = torch.ones((1, 1), dtype=torch.long, device=device) * (
+                    int(self.padding_idx + num_input_ids)
+                )
+            else:
+                if attention_mask is None:
+                    attention_mask = torch.ones(inputs_shape, dtype=torch.long, device=device)
+
+                # retrieve position_ids from input_ids / attention_mask
+                position_ids = (
+                    torch.cumsum(attention_mask, dim=1).type_as(attention_mask) * attention_mask
+                ).long() + self.padding_idx
+
+                # make sure position_ids are not bigger then max_length
+                position_ids = position_ids.clamp(0, self.max_length - 1)
+
+        return super().forward(position_ids), position_ids
+
+    def _forward(self, position_ids):
+        return super().forward(position_ids)
+
+
+class XLMProphetNetAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        config: XLMProphetNetConfig,
+        num_attn_heads: int,
+    ):
+        super().__init__()
+        hidden_size = config.hidden_size
+
+        self.attention_dropout = config.attention_dropout
+        self.dropout = config.dropout
+        self.num_attn_heads = num_attn_heads
+        self.head_dim = hidden_size // num_attn_heads
+
+        assert self.head_dim * num_attn_heads == hidden_size, (
+            "`config.hidden_size` must be divisible by `config.num_encoder_attention_heads` and"
+            " `config.num_decoder_attention_heads`"
+        )
+
+        self.key_proj = nn.Linear(hidden_size, hidden_size)
+        self.value_proj = nn.Linear(hidden_size, hidden_size)
+        self.query_proj = nn.Linear(hidden_size, hidden_size)
+
+        self.out_proj = nn.Linear(hidden_size, hidden_size)
+
+    def _shape(self, tensor: torch.Tensor, seq_len: int, bsz: int):
+        return tensor.view(bsz, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        key_value_states: Optional[Tensor] = None,
+        attention_mask: Optional[Tensor] = None,
+        layer_head_mask: Optional[Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: bool = False,
+    ) -> tuple[Tensor, Optional[Tensor]]:
+        batch_size, tgt_len, hidden_size = hidden_states.size()
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        assert list(hidden_states.size()) == [
+            batch_size,
+            tgt_len,
+            hidden_size,
+        ], f"Size of hidden states should be {batch_size, tgt_len, hidden_size}, but is {hidden_states.size()}"
+
+        # previous time steps are cached - no need to recompute key and value if they are static
+        query_states = self.query_proj(hidden_states) / (self.head_dim**0.5)
+
+        if is_cross_attention and past_key_values is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_values[0]
+            value_states = past_key_values[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.key_proj(key_value_states), -1, batch_size)
+            value_states = self._shape(self.value_proj(key_value_states), -1, batch_size)
+        else:
+            # self_attention
+            key_states = self._shape(self.key_proj(hidden_states), -1, batch_size)
+            value_states = self._shape(self.value_proj(hidden_states), -1, batch_size)
+
+        if is_cross_attention:
+            # if cross_attention save Tuple(torch.Tensor, torch.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if encoder bi-directional self-attention `past_key_values` is always `None`
+            past_key_values = (key_states, value_states)
+
+        # project states into the correct shape
+        proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim)
+        query_states = self._shape(query_states, tgt_len, batch_size).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+        src_len = key_states.size(2)
+        attn_weights = torch.einsum("bsij,bsjk->bsik", query_states, key_states.transpose(2, 3))
+        expected_shape = (batch_size, self.num_attn_heads, tgt_len, src_len)
+        if attn_weights.size() != expected_shape:
+            raise ValueError(f"Attention weights should have size {expected_shape}, but is {attn_weights.size()}")
+
+        # This is part of a workaround to get around fork/join parallelism not supporting Optional types.
+        if attention_mask is not None and attention_mask.dim() == 0:
+            attention_mask = None
+
+        expected_shape = (batch_size, self.num_attn_heads, 1, src_len)
+        if attention_mask is not None and attention_mask.size() != expected_shape:
+            raise ValueError(f"Attention mask should have size {expected_shape}, but is {attention_mask.size()}")
+        if attention_mask is not None:  # don't attend to padding symbols
+            attn_weights = attn_weights + attention_mask
+        if output_attentions:
+            attn_weights_reshaped = attn_weights
+        else:
+            attn_weights_reshaped = None
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if layer_head_mask is not None:
+            assert layer_head_mask.size() == (self.num_attn_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
+                f" {layer_head_mask.size()}"
+            )
+            attn_weights = layer_head_mask.view(1, -1, 1, 1) * attn_weights.view(
+                batch_size, self.num_attn_heads, tgt_len, src_len
+            )
+
+            # apply head_mask also on attn_weights_reshaped which is used for n-gram attention inside the model
+            attn_weights_reshaped = layer_head_mask.view(1, -1, 1, 1) * attn_weights_reshaped
+
+        attn_probs = nn.functional.dropout(
+            attn_weights,
+            p=self.attention_dropout,
+            training=self.training,
+        )
+        attn_output = torch.einsum("bsij,bsjk->bsik", attn_probs, value_states)
+        expected_shape = (batch_size, self.num_attn_heads, tgt_len, self.head_dim)
+        if attn_output.size() != expected_shape:
+            raise ValueError(f"`attn_output` should have shape {expected_shape}, but is of shape {attn_output.size()}")
+
+        attn_output = attn_output.transpose(1, 2).reshape(batch_size, tgt_len, hidden_size)
+        attn_output = self.out_proj(attn_output)
+
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+        return attn_output, attn_weights_reshaped, past_key_values
+
+
+class XLMProphetNetFeedForward(nn.Module):
+    """
+    This is the residual two feed-forward layer block based on the original Transformer implementation.
+    """
+
+    def __init__(self, config: XLMProphetNetConfig, ffn_dim: int):
+        super().__init__()
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.intermediate = nn.Linear(config.hidden_size, ffn_dim)
+        self.output = nn.Linear(ffn_dim, config.hidden_size)
+        self.activation_dropout = config.activation_dropout
+        self.dropout = config.dropout
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.output(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        return hidden_states
+
+
+class XLMProphetNetNgramSelfAttention(nn.Module):
+    def __init__(self, config: XLMProphetNetConfig):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.num_buckets = config.num_buckets
+        self.relative_max_distance = config.relative_max_distance
+        self.num_attn_heads = config.num_decoder_attention_heads
+        self.dropout = config.dropout
+        self.attention_dropout = config.attention_dropout
+        self.head_dim = config.hidden_size // self.num_attn_heads
+        self.ngram = config.ngram
+
+        assert self.head_dim * self.num_attn_heads == config.hidden_size, (
+            "config.hidden_size must be divisible by num_attn_heads"
+        )
+        # key, value, query projection
+        self.key_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.value_proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.query_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+        # out projection
+        self.out_proj = nn.Linear(config.hidden_size, config.hidden_size)
+
+        # rel position embeddings
+        self.relative_pos_embeddings = nn.Linear(config.hidden_size, self.num_buckets * self.num_attn_heads)
+
+        # for onnx runtime
+        self.onnx_trace = False
+
+    def _shape(self, tensor, seq_len, batch_size):
+        return tensor.view(batch_size, seq_len, self.num_attn_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def prepare_for_onnx_export_(self):
+        self.onnx_trace = True
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        past_key_values: Optional[Cache] = None,
+        attention_mask=None,
+        layer_head_mask=None,
+        extended_predict_attention_mask=None,
+        main_relative_position_buckets=None,
+        predict_relative_position_buckets=None,
+        position_ids=None,
+    ):
+        batch_size, ngram_sequence_length, hidden_size = hidden_states.size()
+        assert list(hidden_states.size()) == [batch_size, ngram_sequence_length, hidden_size], (
+            f"`hidden_states` should be of shape {batch_size, ngram_sequence_length, hidden_size}, but is of shape"
+            f" {hidden_states.shape}"
+        )
+
+        # project
+        query_states = self.query_proj(hidden_states)
+        key_states = self.key_proj(hidden_states)
+        value_states = self.value_proj(hidden_states)
+
+        # normalize
+        query_states = query_states / (self.head_dim**0.5)
+
+        # reshape
+        query_states = self._shape(query_states, ngram_sequence_length, batch_size)
+        key_states = self._shape(key_states, -1, batch_size)
+        value_states = self._shape(value_states, -1, batch_size)
+        proj_shape = (batch_size, self.num_attn_heads, -1, self.head_dim)
+
+        query_states = query_states.view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        # chunk into main stream and predict stream
+        hidden_states_list = hidden_states.chunk(1 + self.ngram, dim=1)
+        query_states_list = query_states.chunk(1 + self.ngram, dim=2)
+        key_states_list = key_states.chunk(1 + self.ngram, dim=2)
+        value_states_list = value_states.chunk(1 + self.ngram, dim=2)
+
+        main_hidden_states, hidden_states_predict_list = hidden_states_list[0], hidden_states_list[1:]
+        main_query_states, predict_query_states_list = query_states_list[0], query_states_list[1:]
+        main_key_states, predict_key_states_list = key_states_list[0], key_states_list[1:]
+        main_value_states, predict_value_states_list = value_states_list[0], value_states_list[1:]
+
+        # saved states are stored with shape (batch_size, num_attn_heads, seq_len, head_dim)
+        if past_key_values is not None:
+            prev_main_key_states = past_key_values[0]
+            main_key_states = torch.cat((prev_main_key_states, main_key_states), dim=2)
+            prev_main_value_states = past_key_values[1]
+            main_value_states = torch.cat((prev_main_value_states, main_value_states), dim=2)
+
+        # Update cache
+        past_key_values = (main_key_states, main_value_states)
+
+        # get seq_length of main stream only
+        sequence_length = ngram_sequence_length // (1 + self.ngram)
+
+        # MAIN-STREAM
+        # main attn weights
+        # [batch_size, number_heads, sequence_length, head_dimesion]
+        # x [batch_size, number_heads, head_dimesion, sequence_length]
+        # -> [batch_size, number_heads, sequence_length, sequence_length]
+        main_attn_weights = torch.einsum("bntc,bncs->bnts", main_query_states, main_key_states.transpose(2, 3))
+
+        # retrieve relative position embeddings for each layer -> see paper for more details
+        main_relative_pos_embeddings = self.get_main_relative_pos_embeddings(
+            main_hidden_states, main_attn_weights, position_ids, main_relative_position_buckets
+        )
+
+        main_attn_weights = main_attn_weights + main_relative_pos_embeddings
+
+        if attention_mask is not None:
+            main_attn_weights = main_attn_weights + attention_mask
+
+        main_attn_probs = softmax(
+            main_attn_weights,
+            dim=-1,
+            onnx_trace=self.onnx_trace,
+        ).type_as(main_attn_weights)
+
+        if layer_head_mask is not None:
+            assert layer_head_mask.size() == (self.num_attn_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
+                f" {layer_head_mask.size()}"
+            )
+            main_attn_probs = layer_head_mask.view(1, -1, 1, 1) * main_attn_probs.view(
+                batch_size, self.num_attn_heads, -1, sequence_length
+            )
+
+        main_attn_probs = nn.functional.dropout(main_attn_probs, p=self.attention_dropout, training=self.training)
+        # project to attn_output
+        # [batch_size, number_heads, sequence_length, sequence_length]
+        # x [batch_size, number_heads, sequence_length, head_dimesion]
+        # -> [batch_size, number_heads, sequence_length, head_dimesion]
+        main_attn_output = torch.einsum("bntc,bncs->bnts", main_attn_probs, main_value_states)
+        # reshape so that num_heads dim is merged into last `head_dim` axis
+        main_attn_output = main_attn_output.transpose(1, 2).reshape(batch_size, 1, sequence_length, hidden_size)
+        main_attn_output = self.out_proj(main_attn_output)
+
+        # PREDICT-STREAM
+        # [batch_size, ngram, number_heads, sequence_length, head_dimesion]
+        predict_query_states = torch.stack(predict_query_states_list, 1).view(
+            batch_size, self.ngram, self.num_attn_heads, sequence_length, self.head_dim
+        )
+
+        # [batch_size, ngram, number_heads, 2*sequence_length, head_dimesion]
+        predict_key_states = torch.stack([torch.cat([main_key_states, key], 2) for key in predict_key_states_list], 1)
+
+        # [batch_size, sequence_length, ngram, hidden_size]
+        predict_hidden_states = torch.stack(hidden_states_predict_list, dim=2)
+
+        # [batch_size, number_heads, ngram, 2*sequence_length, head_dimesion]
+        predict_value_states = torch.cat(
+            [torch.cat([main_value_states, v_p], 2).unsqueeze(2) for v_p in predict_value_states_list], 2
+        )
+
+        # [batch_size, ngram, number_heads, sequence_length, head_dimesion]
+        # x [batch_size, ngram, number_heads, 2*sequence_length, head_dimesion]
+        # -> [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+        predict_attn_weights = torch.einsum("bnhtc,bnhsc->bnhts", (predict_query_states, predict_key_states))
+
+        # retrieve relative position embeddings for each layer -> see paper for more details
+        # [batch_size, ngram, number_heads, sequence_length, predict_relative_pos_embeddings]
+        predict_relative_pos_embeddings = self.get_predict_relative_pos_embeddings(
+            predict_hidden_states, predict_attn_weights, position_ids, predict_relative_position_buckets
+        )
+
+        # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+        predict_attn_weights = predict_attn_weights + predict_relative_pos_embeddings
+
+        if extended_predict_attention_mask is not None:
+            # Permuting Predict attention mask to [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+            extended_predict_attention_mask = extended_predict_attention_mask.permute(0, 2, 1, 3, 4)
+            extended_predict_attention_mask = extended_predict_attention_mask.to(predict_attn_weights.dtype)
+            predict_attn_weights = predict_attn_weights + extended_predict_attention_mask
+
+        predict_attn_probs = softmax(
+            predict_attn_weights,
+            dim=-1,
+            onnx_trace=self.onnx_trace,
+        ).type_as(predict_attn_weights)
+
+        if layer_head_mask is not None:
+            assert layer_head_mask.size() == (self.num_attn_heads,), (
+                f"Head mask for a single layer should be of size {(self.num_attn_heads,)}, but is"
+                f" {layer_head_mask.size()}"
+            )
+            predict_attn_probs = layer_head_mask.view(1, 1, -1, 1, 1) * predict_attn_probs
+
+        predict_attn_probs = nn.functional.dropout(
+            predict_attn_probs, p=self.attention_dropout, training=self.training
+        )
+        # project to attention output
+        # [batch_size, ngram, number_heads, sequence_length, 2*sequence_length]
+        # x [batch_size, ngram, number_heads, 2*sequence_length, head_dimesion]
+        # -> [batch_size, ngram, number_heads, sequence_length, head_dimesion]
+        predict_attn_output = torch.einsum(
+            "bnhts,bnhsc->bnhtc", (predict_attn_probs, predict_value_states.transpose(1, 2))
+        )
+
+        # reshape so that num_heads dim is merged into last `head_dim` axis
+        # [batch_size, ngram, number_heads, sequence_length, head_dimesion] -> [batch_size, ngram, sequence_length, hidden_size]
+        predict_attn_output = predict_attn_output.transpose(2, 3)
+        predict_attn_output = predict_attn_output.reshape(batch_size, self.ngram, sequence_length, hidden_size)
+        predict_attn_output = self.out_proj(predict_attn_output)
+
+        # concat to single attn output
+        # [batch_size, (1+ngram)*sequence_length, hidden_size]
+        attn_output = torch.cat([main_attn_output, predict_attn_output], 1).view(batch_size, -1, hidden_size)
+        # reshape into better form for `config.output_attentions`
+        main_attn_probs = main_attn_probs.view(batch_size, self.num_attn_heads, sequence_length, -1)
+
+        attn_output = nn.functional.dropout(attn_output, p=self.dropout, training=self.training)
+
+        return attn_output, main_attn_probs, predict_attn_probs, past_key_values
+
+    def get_main_relative_pos_embeddings(
+        self, hidden_states, attn_weights, position_ids, main_relative_position_buckets
+    ):
+        # input hidden_states [batch_size, sequence_length, hidden_size]
+        # input attn_weights [batch_size, num_heads, sequence_length, sequence_length]
+        # input position_ids [batch_size, sequence_length] or [1,1]
+        batch_size, num_attn_heads, tgt_len, src_len = attn_weights.shape
+        attn_weights = attn_weights.view(batch_size, num_attn_heads, tgt_len, src_len)
+        if main_relative_position_buckets is None:
+            batch_size, sequence_length = hidden_states.shape[:2]
+            relative_positions = (
+                torch.arange(1, attn_weights.shape[-1] + 1)
+                .unsqueeze(0)
+                .unsqueeze(0)
+                .repeat(batch_size, sequence_length, 1)
+                .to(position_ids.device)
+            )
+            # [batch_size, sequence_length, sequence_length+1]
+            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
+            main_relative_position_buckets = compute_relative_buckets(
+                self.num_buckets, self.relative_max_distance, relative_positions, False
+            )
+
+        # [batch_size, sequence_length, num_buckets * num_heads]
+        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
+        rel_pos_embeddings = rel_pos_embeddings.view(
+            rel_pos_embeddings.shape[:2] + (self.num_buckets, self.num_attn_heads)
+        )
+        rel_pos_embeddings = rel_pos_embeddings.permute(0, 3, 1, 2)
+        # [batch_size, num_heads, sequence_length, num_buckets]
+        rel_pos_embeddings = rel_pos_embeddings.reshape(attn_weights.shape[:3] + (-1,))
+
+        main_relative_position_buckets = main_relative_position_buckets.repeat(1, self.num_attn_heads, 1)
+        # [batch_size * num_heads * sequence_length, sequence_length]
+        main_relative_position_buckets = main_relative_position_buckets.view(
+            -1, main_relative_position_buckets.shape[-1]
+        )
+        main_relative_position_buckets = main_relative_position_buckets.long()
+        # [batch_size * num_heads * sequence_length, sequence_length]
+        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, rel_pos_embeddings.size(-1))
+
+        main_relative_pos_embeddings = torch.gather(rel_pos_embeddings, dim=1, index=main_relative_position_buckets)
+        main_relative_pos_embeddings = main_relative_pos_embeddings.view(batch_size, num_attn_heads, tgt_len, -1)
+        return main_relative_pos_embeddings
+
+    def get_predict_relative_pos_embeddings(
+        self, hidden_states, attn_weights, position_ids, predict_relative_position_buckets
+    ):
+        # input hidden_states [batch_size, sequence_length, ngram, hidden_size]
+        # input attn_weights [batch_size, ngram, num_heads, sequence_length, 2*sequence_length]
+        # input position_ids [batch_size, sequence_length] or [1,1]
+        # input predict_relative_position_buckets [batch_size, sequence_length, 2*sequence_length] or None
+        batch_size, sequence_length = hidden_states.shape[0:2]
+
+        if predict_relative_position_buckets is None:
+            key_sequence_length = attn_weights.shape[-1]
+            assert position_ids[0][0] == key_sequence_length - 1, (
+                "`position_ids` are incorrect. They should be of the format 1 2 3 4 5 ... (key_sequence_length - 1)"
+            )
+            relative_positions = (
+                torch.arange(0, key_sequence_length)
+                .unsqueeze(0)
+                .unsqueeze(0)
+                .repeat(batch_size, sequence_length, 1)
+                .to(position_ids.device)
+            )
+
+            relative_positions = relative_positions - position_ids.unsqueeze(0).repeat(batch_size, sequence_length, 1)
+            predict_relative_position_buckets = compute_relative_buckets(
+                self.num_buckets, self.relative_max_distance, relative_positions, False
+            )
+
+        # [batch_size, ngram, sequence_length, hidden_size]
+        hidden_states = hidden_states.transpose(1, 2)
+        rel_pos_embeddings = self.relative_pos_embeddings(hidden_states)
+
+        # [batch_size, ngram, sequence_length, num_buckets, num_heads]
+        rel_pos_embeddings = rel_pos_embeddings.view(
+            hidden_states.shape[:-1] + (self.num_buckets, self.num_attn_heads)
+        )
+        rel_pos_embeddings = rel_pos_embeddings.permute(0, 2, 1, 4, 3)
+        # [batch_size * ngram * sequence_length * num_heads, num_buckets]
+        rel_pos_embeddings = rel_pos_embeddings.reshape(-1, self.num_buckets)
+        # [ngram, batch_size, num_heads * sequence_length, -1]
+        predict_relative_position_buckets = predict_relative_position_buckets.unsqueeze(0)
+        predict_relative_position_buckets = predict_relative_position_buckets.repeat(
+            self.ngram, 1, self.num_attn_heads, 1
+        )
+        # [ngram * batch_size * num_heads * sequence_length, -1]
+        predict_relative_position_buckets = predict_relative_position_buckets.view(
+            -1, predict_relative_position_buckets.size(-1)
+        ).long()
+
+        predict_relative_pos_embeddings = torch.gather(
+            rel_pos_embeddings, dim=1, index=predict_relative_position_buckets
+        )
+
+        # [batch_size, gram, num_heads, sequence_length, -1]
+        predict_relative_pos_embeddings = predict_relative_pos_embeddings.view(
+            batch_size, self.ngram, self.num_attn_heads, sequence_length, -1
+        )
+
+        return predict_relative_pos_embeddings
+
+
+class XLMProphetNetEncoderLayer(GradientCheckpointingLayer):
+    """
+    Encoder block for XLMProphetnet
+    """
+
+    def __init__(self, config: XLMProphetNetConfig):
+        super().__init__()
+        # 1st residual block
+        self.self_attn = XLMProphetNetAttention(config, config.num_encoder_attention_heads)
+        self.self_attn_layer_norm = LayerNorm(config.hidden_size)
+
+        # 2nd residual block
+        self.feed_forward = XLMProphetNetFeedForward(config, config.encoder_ffn_dim)
+        self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask,
+        layer_head_mask,
+        output_attentions: bool = False,
+    ):
+        # 1st residual block
+        attention_output, attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_layer_norm(attention_output + hidden_states)
+
+        # 2nd residual block
+        feed_forward_output = self.feed_forward(hidden_states)
+        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class XLMProphetNetDecoderLayer(GradientCheckpointingLayer):
+    """
+    Decoder block for XLMProphetnet
+    """
+
+    def __init__(self, config: XLMProphetNetConfig):
+        super().__init__()
+        # 1st residual block
+        self.self_attn = XLMProphetNetNgramSelfAttention(config)
+        self.self_attn_layer_norm = LayerNorm(config.hidden_size)
+
+        # 2nd residual block
+        if config.add_cross_attention:
+            self.cross_attn = XLMProphetNetAttention(config, config.num_decoder_attention_heads)
+            self.cross_attn_layer_norm = LayerNorm(config.hidden_size)
+
+        # 3rd residual block
+        self.feed_forward = XLMProphetNetFeedForward(config, config.decoder_ffn_dim)
+        self.feed_forward_layer_norm = LayerNorm(config.hidden_size)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attn_mask=None,
+        layer_head_mask=None,
+        cross_attn_layer_head_mask=None,
+        extended_predict_attention_mask=None,
+        main_relative_position_buckets=None,
+        predict_relative_position_buckets=None,
+        position_ids=None,
+        past_key_values=None,
+        use_cache: bool = True,
+        output_attentions: bool = False,
+    ):
+        # 1st residual block
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_values[:2] if past_key_values is not None else None
+        ngram_attention_output, self_attn_weights, self_attn_weights_ngram, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            extended_predict_attention_mask=extended_predict_attention_mask,
+            main_relative_position_buckets=main_relative_position_buckets,
+            predict_relative_position_buckets=predict_relative_position_buckets,
+            position_ids=position_ids,
+        )
+        hidden_states = self.self_attn_layer_norm(hidden_states + ngram_attention_output)
+
+        # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+        cross_attn_past_key_value = past_key_values[-2:] if past_key_values is not None else None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            # 2nd residual block
+            attention_output, cross_attn_weights, cross_attn_present_key_value = self.cross_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attn_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+            )
+            hidden_states = self.cross_attn_layer_norm(attention_output + hidden_states)
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # 3rd residual block
+        feed_forward_output = self.feed_forward(hidden_states)
+        hidden_states = self.feed_forward_layer_norm(feed_forward_output + hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, self_attn_weights_ngram, cross_attn_weights)
+
+        if use_cache:
+            outputs += (present_key_value,)
+
+        return outputs
+
+
+@add_start_docstrings(
+    "The standalone encoder part of the XLMProphetNetModel.",
+    XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetEncoder(XLMProphetNetPreTrainedModel):
+    r"""
+    word_embeddings  (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
+        The word embedding parameters. This can be used to initialize [`XLMProphetNetEncoder`] with pre-defined word
+        embeddings instead of randomly initialized word embeddings.
+    """
+
+    def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
+        super().__init__(config)
+
+        self.word_embeddings = (
+            word_embeddings
+            if word_embeddings is not None
+            else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        )
+        self.position_embeddings = XLMProphetNetPositionalEmbeddings(config)
+        self.embeddings_layer_norm = LayerNorm(config.hidden_size)
+
+        self.layers = nn.ModuleList([XLMProphetNetEncoderLayer(config) for _ in range(config.num_encoder_layers)])
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.word_embeddings = value
+
+    @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=BaseModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, XLMProphetNetEncoder
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+        >>> model = XLMProphetNetEncoder.from_pretrained("patrickvonplaten/prophetnet-large-uncased-standalone")
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("Either input_ids or inputs_embeds has to be passed.")
+        elif input_ids is not None and inputs_embeds is not None:
+            raise ValueError("Make sure to only pass input_ids or inputs_embeds.")
+        elif input_ids is not None and inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        # prepare attention mask
+        if attention_mask is not None:
+            extended_attention_mask = (
+                1.0 - attention_mask[:, None, None, :].repeat(1, self.config.num_encoder_attention_heads, 1, 1)
+            ) * torch.finfo(self.dtype).min
+            extended_attention_mask = extended_attention_mask.to(inputs_embeds.dtype)
+        else:
+            extended_attention_mask = None
+
+        position_embeddings, position_ids = self.position_embeddings(inputs_embeds.shape[:2], inputs_embeds.device)
+
+        hidden_states = inputs_embeds + position_embeddings
+        hidden_states = self.embeddings_layer_norm(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.config.dropout, training=self.training)
+
+        encoder_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            assert head_mask.size()[0] == (len(self.layers)), (
+                f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
+            )
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_hidden_states = encoder_hidden_states + (hidden_states,)
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask=extended_attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_hidden_states = encoder_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_hidden_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_hidden_states, attentions=all_attentions
+        )
+
+
+@add_start_docstrings(
+    "The standalone decoder part of the XLMProphetNetModel.",
+    XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetDecoder(XLMProphetNetPreTrainedModel):
+    r"""
+    word_embeddings  (`torch.nn.Embeddings` of shape `(config.vocab_size, config.hidden_size)`, *optional*):
+        The word embedding parameters. This can be used to initialize [`XLMProphetNetEncoder`] with pre-defined word
+        embeddings instead of randomly initialized word embeddings.
+    """
+
+    def __init__(self, config: XLMProphetNetConfig, word_embeddings: Optional[nn.Embedding] = None):
+        super().__init__(config)
+
+        self.ngram = config.ngram
+        self.num_buckets = config.num_buckets
+        self.relative_max_distance = config.relative_max_distance
+        self.dropout = config.dropout
+        self.max_target_positions = config.max_position_embeddings
+
+        self.word_embeddings = (
+            word_embeddings
+            if word_embeddings is not None
+            else nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        )
+        self.position_embeddings = XLMProphetNetPositionalEmbeddings(config)
+
+        self.ngram_embeddings = nn.Embedding(self.ngram, config.hidden_size, None)
+        self.layers = nn.ModuleList([XLMProphetNetDecoderLayer(config) for _ in range(config.num_decoder_layers)])
+        self.embeddings_layer_norm = LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.word_embeddings = value
+
+    @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=XLMProphetNetDecoderModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, XLMProphetNetDecoderModelOutput]:
+        r"""
+        encoder_hidden_states  (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, XLMProphetNetDecoder
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+        >>> model = XLMProphetNetDecoder.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone", add_cross_attention=False)
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is None and inputs_embeds is None:
+            raise ValueError("Either `decoder_input_ids` or `decoder_inputs_embeds` has to be passed.")
+        elif input_ids is not None and inputs_embeds is not None:
+            raise ValueError("Make sure to only pass `decoder_input_ids` or `decoder_inputs_embeds`.")
+        elif input_ids is not None and inputs_embeds is None:
+            inputs_embeds = self.word_embeddings(input_ids)
+
+        batch_size, sequence_length = inputs_embeds.shape[:2]
+
+        main_stream_pos_embed, position_ids = self.position_embeddings(
+            (batch_size, sequence_length),
+            device=inputs_embeds.device,
+            past_key_values=past_key_values,
+        )
+
+        if past_key_values is not None:
+            main_relative_position_buckets, predict_relative_position_buckets = None, None
+        else:
+            (
+                main_relative_position_buckets,
+                predict_relative_position_buckets,
+            ) = self.compute_buffered_relative_buckets(position_ids)
+        predicting_stream_pos_embed = self.position_embeddings._forward(position_ids + 1)
+
+        # add position embeddings
+        hidden_states = inputs_embeds + main_stream_pos_embed
+
+        ngram_embeddings = self.ngram_embeddings.weight
+
+        # prepare attention mask
+        if past_key_values is not None:
+            assert hidden_states.size(1) == 1, (
+                "At the moment `use_cache` is only supported for `decoder_input_ids` of length 1"
+            )
+
+            ngram_hidden_states = [
+                (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed).repeat(batch_size, 1, 1)
+                for ngram in range(self.ngram)
+            ]
+            extended_attention_mask = None
+            extended_predict_attention_mask = None
+        else:
+            ngram_hidden_states = [
+                (ngram_embeddings[ngram - 1] + predicting_stream_pos_embed) for ngram in range(self.ngram)
+            ]
+            extended_attention_mask = self.prepare_attention_mask(hidden_states, attention_mask)
+            extended_predict_attention_mask = self.prepare_predict_attention_mask(hidden_states, attention_mask)
+
+        # prepare encoder attention mask
+        if encoder_attention_mask is not None:
+            extended_encoder_attention_mask = (
+                1.0 - encoder_attention_mask[:, None, None, :].repeat(1, self.config.num_decoder_attention_heads, 1, 1)
+            ) * torch.finfo(self.dtype).min
+            extended_encoder_attention_mask = extended_encoder_attention_mask.to(inputs_embeds.dtype)
+        else:
+            extended_encoder_attention_mask = None
+
+        hidden_states = torch.cat([hidden_states] + ngram_hidden_states, 1)
+
+        if self.embeddings_layer_norm:
+            hidden_states = self.embeddings_layer_norm(hidden_states)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # init attentions, hidden_states and cache with empty tuples
+        all_main_stream_hidden_states = () if output_hidden_states else None
+        all_ngram_stream_hidden_states = () if output_hidden_states and self.config.ngram > 0 else None
+
+        all_main_stream_attns = () if output_attentions else None
+        all_ngram_stream_attns = () if output_attentions else None
+        all_cross_attns = () if output_attentions and self.config.add_cross_attention else None
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        present_key_values = () if use_cache else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                assert attn_mask.size()[0] == (len(self.layers)), (
+                    f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+        for idx, decoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                # grad cannot be kept because tensor is sliced
+                all_main_stream_hidden_states += (hidden_states[:, :sequence_length],)
+                if self.config.ngram > 0:
+                    all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=extended_attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attn_mask=extended_encoder_attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+                extended_predict_attention_mask=extended_predict_attention_mask,
+                main_relative_position_buckets=main_relative_position_buckets,
+                predict_relative_position_buckets=predict_relative_position_buckets,
+                position_ids=position_ids,
+                past_key_values=past_key_values[idx] if past_key_values is not None else None,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if use_cache:
+                present_key_values += (layer_outputs[4 if output_attentions else 1],)
+
+            if output_attentions:
+                all_main_stream_attns += (layer_outputs[1],)
+                all_ngram_stream_attns += (layer_outputs[2],)
+
+                if self.config.add_cross_attention:
+                    all_cross_attns += (layer_outputs[3],)
+
+        if output_hidden_states:
+            all_main_stream_hidden_states += (hidden_states[:, :sequence_length],)
+            if self.config.ngram > 0:
+                all_ngram_stream_hidden_states += (hidden_states[:, sequence_length:],)
+
+        # split last_hidden_state for return
+        last_hidden_state = hidden_states[:, :sequence_length]
+        last_hidden_state_ngram = hidden_states[:, sequence_length:] if self.config.ngram > 0 else None
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    last_hidden_state,
+                    last_hidden_state_ngram,
+                    present_key_values,
+                    all_main_stream_hidden_states,
+                    all_ngram_stream_hidden_states,
+                    all_main_stream_attns,
+                    all_ngram_stream_attns,
+                    all_cross_attns,
+                ]
+                if v is not None
+            )
+        return XLMProphetNetDecoderModelOutput(
+            last_hidden_state=last_hidden_state,
+            last_hidden_state_ngram=last_hidden_state_ngram,
+            past_key_values=present_key_values,
+            hidden_states=all_main_stream_hidden_states,
+            hidden_states_ngram=all_ngram_stream_hidden_states,
+            attentions=all_main_stream_attns,
+            ngram_attentions=all_ngram_stream_attns,
+            cross_attentions=all_cross_attns,
+        )
+
+    def compute_buffered_relative_buckets(self, position_ids):
+        batch_size, sequence_length = position_ids.shape
+
+        position_ids = torch.arange(1, self.max_target_positions).to(position_ids.device).repeat(1, 1)
+        main_relative_buckets, predict_relative_buckets = compute_all_stream_relative_buckets(
+            self.num_buckets, self.relative_max_distance, position_ids
+        )
+
+        # buffer relative buckets
+        main_relative_buckets = main_relative_buckets[:, :sequence_length, :sequence_length].repeat(batch_size, 1, 1)
+        predict_relative_buckets = torch.cat(
+            [
+                predict_relative_buckets[:, :sequence_length, :sequence_length],
+                predict_relative_buckets[
+                    :, :sequence_length, self.max_target_positions : self.max_target_positions + sequence_length
+                ],
+            ],
+            2,
+        ).repeat(batch_size, 1, 1)
+
+        return main_relative_buckets, predict_relative_buckets
+
+    def prepare_attention_mask(self, hidden_states, attention_mask):
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        # get causal mask
+        causal_mask = torch.full(
+            (seq_length, seq_length),
+            torch.finfo(hidden_states.dtype).min,
+            dtype=hidden_states.dtype,
+            device=hidden_states.device,
+        )
+        causal_mask = torch.triu(causal_mask, 1)
+
+        extended_causal_mask = causal_mask[:seq_length, :seq_length][None, None, :, :].expand(
+            (batch_size, self.config.num_decoder_attention_heads) + causal_mask.shape
+        )
+
+        # add usual attention mask
+        if attention_mask is not None:
+            extended_attention_mask = (1.0 - attention_mask[:, None, None, :]) * torch.finfo(self.dtype).min
+            extended_attention_mask = extended_causal_mask + extended_attention_mask
+        else:
+            extended_attention_mask = extended_causal_mask
+        return extended_attention_mask.to(hidden_states.dtype)
+
+    def prepare_predict_attention_mask(self, hidden_states, attention_mask):
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        # get causal mask
+        predict_causal_mask = ngram_attention_bias(
+            self.max_target_positions, self.ngram, hidden_states.device, hidden_states.dtype
+        )
+        predict_causal_mask = torch.cat(
+            [
+                predict_causal_mask[:, :seq_length, :seq_length],
+                predict_causal_mask[
+                    :, :seq_length, self.max_target_positions : self.max_target_positions + seq_length
+                ],
+            ],
+            dim=-1,
+        )
+        extended_predict_causal_mask = predict_causal_mask[None, None, :, :, :].expand(
+            (batch_size, self.config.num_decoder_attention_heads) + predict_causal_mask.shape
+        )
+
+        # add usual attention mask
+        if attention_mask is not None:
+            extended_attention_mask = (1.0 - attention_mask[:, None, None, None, :]) * torch.finfo(self.dtype).min
+            extended_attention_mask = extended_attention_mask.expand(
+                (batch_size, self.config.num_decoder_attention_heads, self.ngram, seq_length, seq_length)
+            )
+            # predicted stream attention_mask should always be 0
+            extended_attention_mask = torch.cat(
+                [extended_attention_mask, torch.zeros_like(extended_attention_mask)], dim=-1
+            )
+            extended_predict_attention_mask = extended_predict_causal_mask + extended_attention_mask
+        else:
+            extended_predict_attention_mask = extended_predict_causal_mask
+        return extended_predict_attention_mask.to(hidden_states.dtype)
+
+
+@add_start_docstrings(
+    "The bare XLMProphetNet Model outputting raw hidden-states without any specific head on top.",
+    XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetModel(XLMProphetNetPreTrainedModel):
+    _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight"]
+
+    def __init__(self, config: XLMProphetNetConfig):
+        super().__init__(config)
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+
+        encoder_config = copy.deepcopy(config)
+        encoder_config.is_encoder_decoder = False
+        encoder_config.use_cache = False
+        self.encoder = XLMProphetNetEncoder(encoder_config, self.word_embeddings)
+
+        decoder_config = copy.deepcopy(config)
+        decoder_config.is_decoder = True
+        decoder_config.is_encoder_decoder = False
+        self.decoder = XLMProphetNetDecoder(decoder_config, self.word_embeddings)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.word_embeddings = value
+        self.encoder.word_embeddings = self.word_embeddings
+        self.decoder.word_embeddings = self.word_embeddings
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.encoder.word_embeddings, self.word_embeddings)
+            self._tie_or_clone_weights(self.decoder.word_embeddings, self.word_embeddings)
+
+    def get_encoder(self):
+        return self.encoder
+
+    @add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=XLMProphetNetSeq2SeqModelOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, XLMProphetNetSeq2SeqModelOutput]:
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, XLMProphetNetModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+        >>> model = XLMProphetNetModel.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+
+        >>> last_hidden_states = outputs.last_hidden_state  # main stream hidden states
+        >>> last_hidden_states_ngram = outputs.last_hidden_state_ngram  # predict hidden states
+        ```"""
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+        return XLMProphetNetSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            last_hidden_state_ngram=decoder_outputs.last_hidden_state_ngram,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_ngram_hidden_states=decoder_outputs.hidden_states_ngram,
+            decoder_attentions=decoder_outputs.attentions,
+            decoder_ngram_attentions=decoder_outputs.ngram_attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@add_start_docstrings(
+    "The XLMProphetNet Model with a language modeling head. Can be used for sequence generation tasks.",
+    XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetForConditionalGeneration(XLMProphetNetPreTrainedModel):
+    _tied_weights_keys = ["encoder.word_embeddings.weight", "decoder.word_embeddings.weight", "lm_head.weight"]
+
+    def __init__(self, config: XLMProphetNetConfig):
+        super().__init__(config)
+        self.prophetnet = XLMProphetNetModel(config)
+        self.padding_idx = config.pad_token_id
+        self.disable_ngram_loss = config.disable_ngram_loss
+
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.prophetnet.word_embeddings, self.lm_head)
+
+    def get_input_embeddings(self):
+        return self.prophetnet.word_embeddings
+
+    @add_start_docstrings_to_model_forward(XLM_PROPHETNET_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=XLMProphetNetSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, XLMProphetNetSeq2SeqLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size - 1]`. All labels set to `-100` are ignored (masked), the loss is only computed for
+            labels in `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, XLMProphetNetForConditionalGeneration
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+        >>> model = XLMProphetNetForConditionalGeneration.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+
+        >>> input_ids = tokenizer(
+        ...     "Studies have been shown that owning a dog is good for you", return_tensors="pt"
+        ... ).input_ids  # Batch size 1
+        >>> decoder_input_ids = tokenizer("Studies show that", return_tensors="pt").input_ids  # Batch size 1
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=decoder_input_ids)
+
+        >>> logits_next_token = outputs.logits  # logits to predict next token as usual
+        >>> logits_ngram_next_tokens = outputs.logits_ngram  # logits to predict 2nd, 3rd, ... next tokens
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and decoder_input_ids is None and decoder_inputs_embeds is None:
+            # get decoder inputs from shifting lm labels to the right
+            decoder_input_ids = self._shift_right(labels)
+
+        outputs = self.prophetnet(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        batch_size, sequence_length = (
+            decoder_input_ids.shape if decoder_input_ids is not None else decoder_inputs_embeds.shape[:2]
+        )
+
+        predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
+        predict_logits = self.lm_head(predicting_streams)
+
+        logits = predict_logits[:, 0]
+        logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None
+
+        # To use .view in loss computation, make sure that logits is contiguous.
+        if not logits.is_contiguous():
+            logits = logits.contiguous()
+
+        loss = None
+        if labels is not None:
+            loss = self._compute_loss(predict_logits, labels)
+
+        if not return_dict:
+            all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
+            return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
+        else:
+            return XLMProphetNetSeq2SeqLMOutput(
+                loss=loss,
+                logits=logits,
+                logits_ngram=logits_ngram,
+                past_key_values=outputs.past_key_values,
+                decoder_hidden_states=outputs.decoder_hidden_states,
+                decoder_ngram_hidden_states=outputs.decoder_ngram_hidden_states,
+                decoder_attentions=outputs.decoder_attentions,
+                decoder_ngram_attentions=outputs.decoder_ngram_attentions,
+                cross_attentions=outputs.cross_attentions,
+                encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+                encoder_hidden_states=outputs.encoder_hidden_states,
+                encoder_attentions=outputs.encoder_attentions,
+            )
+
+    def _compute_loss(self, logits, labels, ignore_index=-100):
+        expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
+
+        for i in range(self.config.ngram):
+            if i > 0 and self.disable_ngram_loss:
+                break
+            expend_targets[i, :, :] = labels
+
+        logits = logits.transpose(0, 1).contiguous()
+        lprobs = nn.functional.log_softmax(
+            logits.view(-1, logits.size(-1)),
+            dim=-1,
+            dtype=torch.float32,
+        )
+
+        loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
+
+        if self.config.eps > 0.0:
+            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+            non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
+            smooth_loss = smooth_loss[non_masked_tokens]
+            smooth_loss = smooth_loss.mean()
+
+            eps_i = self.config.eps / lprobs.size(-1)
+            loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
+
+        return loss
+
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        cross_attn_head_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        assert encoder_outputs is not None, "`encoder_outputs` have to be passed for generation."
+
+        if past_key_values:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+        # first step, decoder_cached_states are empty
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "head_mask": head_mask,
+            "decoder_head_mask": decoder_head_mask,
+            "cross_attn_head_mask": cross_attn_head_mask,
+            "use_cache": use_cache,
+        }
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return self._shift_right(labels)
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            # cached cross_attention states don't have to be reordered -> they are always the same
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past[:2])
+                + layer_past[2:],
+            )
+        return reordered_past
+
+    def get_encoder(self):
+        return self.prophetnet.encoder
+
+    def get_decoder(self):
+        return self.prophetnet.decoder
+
+
+@add_start_docstrings(
+    "The standalone decoder part of the XLMProphetNetModel with a lm head on top. The model can be used for causal"
+    " language modeling.",
+    XLM_PROPHETNET_START_DOCSTRING,
+)
+class XLMProphetNetForCausalLM(XLMProphetNetPreTrainedModel):
+    _tied_weights_keys = [
+        "prophetnet.word_embeddings.weight",
+        "prophetnet.decoder.word_embeddings.weight",
+        "lm_head.weight",
+    ]
+
+    def __init__(self, config: XLMProphetNetConfig):
+        # set config for CLM
+        config = copy.deepcopy(config)
+        config.is_decoder = True
+        config.is_encoder_decoder = False
+        super().__init__(config)
+        self.prophetnet = XLMProphetNetDecoderWrapper(config)
+
+        self.padding_idx = config.pad_token_id
+        self.disable_ngram_loss = config.disable_ngram_loss
+
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.prophetnet.decoder.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.prophetnet.decoder.word_embeddings = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.prophetnet.decoder.word_embeddings, self.lm_head)
+
+    def set_decoder(self, decoder):
+        self.prophetnet.decoder = decoder
+
+    def get_decoder(self):
+        return self.prophetnet.decoder
+
+    @add_start_docstrings_to_model_forward(XLM_PROPHETNET_STANDALONE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=XLMProphetNetDecoderLMOutput, config_class=_CONFIG_FOR_DOC)
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, XLMProphetNetDecoderLMOutput]:
+        r"""
+        encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention if
+            the model is configured as a decoder.
+        encoder_attention_mask (`torch.FloatTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on the padding token indices of the encoder input. This mask is used in
+            the cross-attention if the model is configured as a decoder. Mask values selected in `[0, 1]`:
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        past_key_values (`tuple(tuple(torch.FloatTensor))` of length `config.n_layers` with each tuple having 4 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+            Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up decoding.
+
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        use_cache (`bool`, *optional*):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`).
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
+
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, XLMProphetNetForCausalLM
+        >>> import torch
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+        >>> model = XLMProphetNetForCausalLM.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> logits = outputs.logits
+
+        >>> # Model can also be used with EncoderDecoder framework
+        >>> from transformers import BertTokenizer, EncoderDecoderModel, AutoTokenizer
+        >>> import torch
+
+        >>> tokenizer_enc = BertTokenizer.from_pretrained("google-bert/bert-large-uncased")
+        >>> tokenizer_dec = AutoTokenizer.from_pretrained("patrickvonplaten/xprophetnet-large-uncased-standalone")
+        >>> model = EncoderDecoderModel.from_encoder_decoder_pretrained(
+        ...     "google-bert/bert-large-uncased", "patrickvonplaten/xprophetnet-large-uncased-standalone"
+        ... )
+
+        >>> ARTICLE = (
+        ...     "the us state department said wednesday it had received no "
+        ...     "formal word from bolivia that it was expelling the us ambassador there "
+        ...     "but said the charges made against him are `` baseless ."
+        ... )
+        >>> input_ids = tokenizer_enc(ARTICLE, return_tensors="pt").input_ids
+        >>> labels = tokenizer_dec(
+        ...     "us rejects charges against its ambassador in bolivia", return_tensors="pt"
+        ... ).input_ids
+        >>> outputs = model(input_ids=input_ids, decoder_input_ids=labels[:, :-1], labels=labels[:, 1:])
+
+        >>> loss = outputs.loss
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        outputs = self.prophetnet.decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        batch_size, sequence_length = input_ids.shape if input_ids is not None else inputs_embeds.shape[:2]
+
+        predicting_streams = outputs[1].view(batch_size, self.config.ngram, sequence_length, -1)
+        predict_logits = self.lm_head(predicting_streams)
+
+        logits = predict_logits[:, 0]
+        logits_ngram = predict_logits[:, 1:] if self.config.ngram > 1 else None
+
+        loss = None
+        if labels is not None:
+            loss = self._compute_loss(predict_logits, labels)
+
+        if not return_dict:
+            all_logits = tuple(v for v in [logits, logits_ngram] if v is not None)
+            return (loss,) + all_logits + outputs[2:] if loss is not None else all_logits + outputs[2:]
+        else:
+            return XLMProphetNetDecoderLMOutput(
+                loss=loss,
+                logits=logits,
+                logits_ngram=logits_ngram,
+                past_key_values=outputs.past_key_values,
+                hidden_states=outputs.hidden_states,
+                hidden_states_ngram=outputs.hidden_states_ngram,
+                attentions=outputs.attentions,
+                ngram_attentions=outputs.ngram_attentions,
+                cross_attentions=outputs.cross_attentions,
+            )
+
+    def _compute_loss(self, logits, labels, ignore_index=-100):
+        expend_targets = labels.new_zeros(self.config.ngram, labels.size(0), labels.size(1)).fill_(ignore_index)
+
+        for i in range(self.config.ngram):
+            if i > 0 and self.disable_ngram_loss:
+                break
+            expend_targets[i, :, :] = labels
+
+        logits = logits.transpose(0, 1).contiguous()
+        lprobs = nn.functional.log_softmax(
+            logits.view(-1, logits.size(-1)),
+            dim=-1,
+            dtype=torch.float32,
+        )
+
+        loss = nn.functional.nll_loss(lprobs, expend_targets.view(-1), reduction="mean")
+
+        if self.config.eps > 0.0:
+            smooth_loss = -lprobs.sum(dim=-1, keepdim=True)
+            non_masked_tokens = expend_targets.ne(ignore_index).view(-1)
+            smooth_loss = smooth_loss[non_masked_tokens]
+            smooth_loss = smooth_loss.mean()
+
+            eps_i = self.config.eps / lprobs.size(-1)
+            loss = (1.0 - self.config.eps) * loss + eps_i * smooth_loss
+
+        return loss
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        head_mask=None,
+        use_cache=None,
+        **kwargs,
+    ):
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_ids.shape)
+
+        if past_key_values:
+            input_ids = input_ids[:, -1:]
+        # first step, decoder_cached_states are empty
+        return {
+            "input_ids": input_ids,  # encoder_outputs is defined. input_ids not needed
+            "attention_mask": attention_mask,
+            "head_mask": head_mask,
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+        }
+
+    @staticmethod
+    def _reorder_cache(past_key_values, beam_idx):
+        reordered_past = ()
+        for layer_past in past_key_values:
+            reordered_past += (
+                tuple(past_state.index_select(0, beam_idx.to(past_state.device)) for past_state in layer_past),
+            )
+        return reordered_past
+
+
+class XLMProphetNetDecoderWrapper(XLMProphetNetPreTrainedModel):
+    """
+    This is a wrapper class, so that [`XLMProphetNetForCausalLM`] can correctly be loaded from pretrained XLMProphetNet
+    classes.
+    """
+
+    def __init__(self, config: XLMProphetNetConfig):
+        super().__init__(config)
+
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.decoder = XLMProphetNetDecoder(config, word_embeddings=self.word_embeddings)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _tie_weights(self):
+        self._tie_or_clone_weights(self.word_embeddings, self.decoder.get_input_embeddings())
+
+    def forward(self, *args, **kwargs):
+        return self.decoder(*args, **kwargs)
+
+
+__all__ = [
+    "XLMProphetNetDecoder",
+    "XLMProphetNetEncoder",
+    "XLMProphetNetForCausalLM",
+    "XLMProphetNetForConditionalGeneration",
+    "XLMProphetNetModel",
+    "XLMProphetNetPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..77431b13c49f47ac9df31b3d4e76ac9e2060895d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/deprecated/xlm_prophetnet/tokenization_xlm_prophetnet.py
@@ -0,0 +1,322 @@
+# coding=utf-8
+# Copyright 2020 The Microsoft Authors and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections
+import os
+from shutil import copyfile
+from typing import Any, Optional
+
+from ....tokenization_utils import PreTrainedTokenizer
+from ....utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "prophetnet.tokenizer"}
+
+
+def load_vocab(vocab_file):
+    """Loads a vocabulary file into a dictionary."""
+    vocab = collections.OrderedDict()
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        tokens = reader.readlines()
+    for index, token in enumerate(tokens):
+        token = token.rstrip("\n")
+        vocab[token] = index
+    return vocab
+
+
+class XLMProphetNetTokenizer(PreTrainedTokenizer):
+    """
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"[SEP]"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"[UNK]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"[PAD]"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        cls_token (`str`, *optional*, defaults to `"[CLS]"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        mask_token (`str`, *optional*, defaults to `"[MASK]"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Attributes:
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="[SEP]",
+        eos_token="[SEP]",
+        sep_token="[SEP]",
+        unk_token="[UNK]",
+        pad_token="[PAD]",
+        cls_token="[CLS]",
+        mask_token="[MASK]",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning(
+                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
+                " pip install sentencepiece"
+            )
+            raise
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # put special tokens and [unused] tokens into the vocab
+        self.fairseq_tokens_to_ids = {"[PAD]": 0, "[CLS]": 1, "[SEP]": 2, "[UNK]": 3, "[MASK]": 4}
+
+        for i in range(10):
+            tok = f"[unused{i}]"
+            self.fairseq_tokens_to_ids[tok] = 5 + i
+
+        # The first "real" token "," has position 15 in the embedding vocab and position 3 in the spm vocab
+        self.fairseq_offset = 12
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+        # TODO ArthurZ fairseq_ids_to_tokens should be removed
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            cls_token=cls_token,
+            mask_token=mask_token,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+        try:
+            import sentencepiece as spm
+        except ImportError:
+            logger.warning(
+                "You need to install SentencePiece to use XLMRobertaTokenizer: https://github.com/google/sentencepiece"
+                " pip install sentencepiece"
+            )
+            raise
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return ([0] * len(token_ids_0)) + [1]
+        return ([0] * len(token_ids_0)) + [1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLMProphetNet
+        does not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+
+        if token_ids_1 is None:
+            return len(token_ids_0 + sep) * [0]
+        return len(token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + self.fairseq_offset
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> str:
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        spm_id = self.sp_model.PieceToId(token)
+
+        # Need to return unknown token if the SP model returned 0
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. A XLMProphetNet sequence has the following format:
+
+        - single sequence: `X [SEP]`
+        - pair of sequences: `A [SEP] B [SEP]`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return token_ids_0 + [self.sep_token_id]
+        sep = [self.sep_token_id]
+        return token_ids_0 + sep + token_ids_1 + sep
+
+
+__all__ = ["XLMProphetNetTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..47f224f248bd1c222823c9ba2159999b68c641e1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/image_processing_depth_pro.py
@@ -0,0 +1,389 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for DepthPro."""
+
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ...utils.import_utils import requires
+
+
+if TYPE_CHECKING:
+    from .modeling_depth_pro import DepthProDepthEstimatorOutput
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import to_channel_dimension_format
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    is_torch_available,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_torchvision_available, logging, requires_backends
+
+
+if is_torch_available():
+    import torch
+
+if is_torchvision_available():
+    from ...image_utils import pil_torch_interpolation_mapping
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("torchvision", "torch"))
+class DepthProImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a DepthPro image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `(size["height"],
+            size["width"])`. Can be overridden by the `do_resize` parameter in the `preprocess` method.
+        size (`dict`, *optional*, defaults to `{"height": 1536, "width": 1536}`):
+            Size of the output image after resizing. Can be overridden by the `size` parameter in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by the `resample` parameter in the
+            `preprocess` method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by the `do_rescale`
+            parameter in the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by the `rescale_factor` parameter in the
+            `preprocess` method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 1536, "width": 1536}
+        size = get_size_dict(size)
+        self.do_resize = do_resize
+        self.do_rescale = do_rescale
+        self.do_normalize = do_normalize
+        self.size = size
+        self.resample = resample
+        self.rescale_factor = rescale_factor
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized images.
+        """
+        requires_backends(self, "torch")
+
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+
+        # we use torch interpolation instead of image.resize because DepthProImageProcessor
+        # rescales, then normalizes, which may cause some values to become negative, before resizing the image.
+        # image.resize expects all values to be in range [0, 1] or [0, 255] and throws an exception otherwise,
+        # however pytorch interpolation works with negative values.
+        # relevant issue here: https://github.com/huggingface/transformers/issues/34920
+        # input should be (B, C, H, W)
+        image_tensor = torch.from_numpy(image).unsqueeze(0)
+        resized_image = torch.nn.functional.interpolate(
+            input=image_tensor,
+            size=output_size,
+            mode=pil_torch_interpolation_mapping[resample].value,
+        )
+        resized_image = resized_image.squeeze(0).numpy()
+        return resized_image
+
+    def _validate_input_arguments(
+        self,
+        do_resize: bool,
+        size: dict[str, int],
+        resample: PILImageResampling,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Union[float, list[float]],
+        image_std: Union[float, list[float]],
+        data_format: Union[str, ChannelDimension],
+    ):
+        if do_resize and None in (size, resample):
+            raise ValueError("Size and resample must be specified if do_resize is True.")
+
+        if do_rescale and rescale_factor is None:
+            raise ValueError("Rescale factor must be specified if do_rescale is True.")
+
+        if do_normalize and None in (image_mean, image_std):
+            raise ValueError("Image mean and standard deviation must be specified if do_normalize is True.")
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Union[str, ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Dictionary in the format `{"height": h, "width": w}` specifying the size of the output image after
+                resizing.
+            resample (`PILImageResampling` filter, *optional*, defaults to `self.resample`):
+                `PILImageResampling` filter to use if resizing the image e.g. `PILImageResampling.BILINEAR`. Only has
+                an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image values between [0 - 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use if `do_normalize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        resample = resample if resample is not None else self.resample
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+
+        size = size if size is not None else self.size
+
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        self._validate_input_arguments(
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            data_format=data_format,
+        )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        all_images = []
+        for image in images:
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            # depth-pro rescales and normalizes the image before resizing it
+            # uses torch interpolation which requires ChannelDimension.FIRST
+            if do_resize:
+                image = to_channel_dimension_format(image, ChannelDimension.FIRST, input_channel_dim=input_data_format)
+                image = self.resize(image=image, size=size, resample=resample)
+                image = to_channel_dimension_format(image, data_format, input_channel_dim=ChannelDimension.FIRST)
+            else:
+                image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+
+            all_images.append(image)
+
+        data = {"pixel_values": all_images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def post_process_depth_estimation(
+        self,
+        outputs: "DepthProDepthEstimatorOutput",
+        target_sizes: Optional[Union[TensorType, list[tuple[int, int]], None]] = None,
+    ) -> list[dict[str, TensorType]]:
+        """
+        Post-processes the raw depth predictions from the model to generate
+        final depth predictions which is caliberated using the field of view if provided
+        and resized to specified target sizes if provided.
+
+        Args:
+            outputs ([`DepthProDepthEstimatorOutput`]):
+                Raw outputs of the model.
+            target_sizes (`Optional[Union[TensorType, list[tuple[int, int]], None]]`, *optional*, defaults to `None`):
+                Target sizes to resize the depth predictions. Can be a tensor of shape `(batch_size, 2)`
+                or a list of tuples `(height, width)` for each image in the batch. If `None`, no resizing
+                is performed.
+
+        Returns:
+            `list[dict[str, TensorType]]`: A list of dictionaries of tensors representing the processed depth
+            predictions, and field of view (degrees) and focal length (pixels) if `field_of_view` is given in `outputs`.
+
+        Raises:
+            `ValueError`:
+                If the lengths of `predicted_depths`, `fovs`, or `target_sizes` are mismatched.
+        """
+        requires_backends(self, "torch")
+
+        predicted_depth = outputs.predicted_depth
+        fov = outputs.field_of_view
+
+        batch_size = len(predicted_depth)
+
+        if target_sizes is not None and batch_size != len(target_sizes):
+            raise ValueError(
+                "Make sure that you pass in as many fov values as the batch dimension of the predicted depth"
+            )
+
+        results = []
+        fov = [None] * batch_size if fov is None else fov
+        target_sizes = [None] * batch_size if target_sizes is None else target_sizes
+        for depth, fov_value, target_size in zip(predicted_depth, fov, target_sizes):
+            focal_length = None
+            if target_size is not None:
+                # scale image w.r.t fov
+                if fov_value is not None:
+                    width = target_size[1]
+                    focal_length = 0.5 * width / torch.tan(0.5 * torch.deg2rad(fov_value))
+                    depth = depth * width / focal_length
+
+                # interpolate
+                depth = torch.nn.functional.interpolate(
+                    # input should be (B, C, H, W)
+                    input=depth.unsqueeze(0).unsqueeze(1),
+                    size=target_size,
+                    mode=pil_torch_interpolation_mapping[self.resample].value,
+                ).squeeze()
+
+            # inverse the depth
+            depth = 1.0 / torch.clamp(depth, min=1e-4, max=1e4)
+
+            results.append(
+                {
+                    "predicted_depth": depth,
+                    "field_of_view": fov_value,
+                    "focal_length": focal_length,
+                }
+            )
+
+        return results
+
+
+__all__ = ["DepthProImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py
new file mode 100644
index 0000000000000000000000000000000000000000..7c32703b7c25526bc87beabf1f6cbb835f2b2478
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/depth_pro/modeling_depth_pro.py
@@ -0,0 +1,1131 @@
+# coding=utf-8
+# Copyright 2024 The Apple Research Team Authors and The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch DepthPro model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging, torch_int
+from ..auto import AutoModel
+from .configuration_depth_pro import DepthProConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DepthPro's outputs.
+    """
+)
+class DepthProOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, n_patches_per_batch, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the model.
+    features (`Union[torch.FloatTensor, List[torch.FloatTensor]]`, *optional*):
+        Features from encoders. Can be a single feature or a list of features.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    features: Union[torch.FloatTensor, list[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for DepthProForDepthEstimation's output.
+    """
+)
+class DepthProDepthEstimatorOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Classification (or regression if config.num_labels==1) loss.
+    field_of_view (`torch.FloatTensor` of shape `(batch_size,)`, *optional*, returned when `use_fov_model` is provided):
+        Field of View Scaler.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    predicted_depth: Optional[torch.FloatTensor] = None
+    field_of_view: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+def split_to_patches(pixel_values: torch.Tensor, patch_size: int, overlap_ratio: float) -> torch.Tensor:
+    """Creates Patches from Batch."""
+    batch_size, num_channels, height, width = pixel_values.shape
+
+    if height == width == patch_size:
+        # create patches only if scaled image is not already equal to patch size
+        return pixel_values
+
+    stride = torch_int(patch_size * (1 - overlap_ratio))
+
+    patches = F.unfold(pixel_values, kernel_size=(patch_size, patch_size), stride=(stride, stride))
+    patches = patches.permute(2, 0, 1)
+    patches = patches.reshape(-1, num_channels, patch_size, patch_size)
+
+    return patches
+
+
+def reshape_features(hidden_states: torch.Tensor) -> torch.Tensor:
+    """Discard class token and reshape 1D feature map to a 2D grid."""
+    n_samples, seq_len, hidden_size = hidden_states.shape
+    size = torch_int(seq_len**0.5)
+
+    hidden_states = hidden_states[:, -(size**2) :, :]  # remove special tokens if there are any
+    hidden_states = hidden_states.reshape(n_samples, size, size, hidden_size)
+    hidden_states = hidden_states.permute(0, 3, 1, 2)
+
+    return hidden_states
+
+
+def merge_patches(patches: torch.Tensor, batch_size: int, padding: int) -> torch.Tensor:
+    """Merges smaller patches into image-like feature map."""
+    n_patches, hidden_size, out_size, out_size = patches.shape
+    n_patches_per_batch = n_patches // batch_size
+    sqrt_n_patches_per_batch = torch_int(n_patches_per_batch**0.5)
+    new_out_size = sqrt_n_patches_per_batch * out_size
+
+    if n_patches == batch_size:
+        # merge only if the patches were created from scaled image
+        # patches are not created when scaled image size is equal to patch size
+        return patches
+
+    if n_patches_per_batch < 4:
+        # for each batch, at least 4 small patches are required to
+        # recreate a large square patch from merging them and later padding is applied
+        # 3 x (8x8) patches becomes 1 x ( 8x8 ) patch (extra patch ignored, no padding)
+        # 4 x (8x8) patches becomes 1 x (16x16) patch (padding later)
+        # 5 x (8x8) patches becomes 1 x (16x16) patch (extra patch ignored, padding later)
+        # 9 x (8x8) patches becomes 1 x (24x24) patch (padding later)
+        # thus the following code only rearranges the patches and removes extra ones
+        padding = 0
+
+    # make sure padding is not large enough to remove more than half of the patch
+    padding = min(out_size // 4, padding)
+
+    if padding == 0:
+        # faster when no padding is required
+        merged = patches.reshape(n_patches_per_batch, batch_size, hidden_size, out_size, out_size)
+        merged = merged.permute(1, 2, 0, 3, 4)
+        merged = merged[:, :, : sqrt_n_patches_per_batch**2, :, :]
+        merged = merged.reshape(
+            batch_size, hidden_size, sqrt_n_patches_per_batch, sqrt_n_patches_per_batch, out_size, out_size
+        )
+        merged = merged.permute(0, 1, 2, 4, 3, 5)
+        merged = merged.reshape(batch_size, hidden_size, new_out_size, new_out_size)
+    else:
+        # padding example:
+        # let out_size = 8, new_out_size = 32, padding = 2
+        # each patch is separated by "|"
+        # and padding is applied to the merging edges of each patch
+        # 00 01 02 03 04 05 06 07 | 08 09 10 11 12 13 14 15 | 16 17 18 19 20 21 22 23 | 24 25 26 27 28 29 30 31
+        # 00 01 02 03 04 05 -- -- | -- -- 10 11 12 13 -- -- | -- -- 18 19 20 21 -- -- | -- -- 26 27 28 29 30 31
+        i = 0
+        boxes = []
+        for h in range(sqrt_n_patches_per_batch):
+            boxes_in_row = []
+            for w in range(sqrt_n_patches_per_batch):
+                box = patches[batch_size * i : batch_size * (i + 1)]
+
+                # collect paddings
+                paddings = [0, 0, 0, 0]
+                if h != 0:
+                    # remove pad from height if box is not at top border
+                    paddings[0] = padding
+                if w != 0:
+                    # remove pad from width if box is not at left border
+                    paddings[2] = padding
+                if h != sqrt_n_patches_per_batch - 1:
+                    # remove pad from height if box is not at bottom border
+                    paddings[1] = padding
+                if w != sqrt_n_patches_per_batch - 1:
+                    # remove pad from width if box is not at right border
+                    paddings[3] = padding
+
+                # remove paddings
+                _, _, box_h, box_w = box.shape
+                pad_top, pad_bottom, pad_left, pad_right = paddings
+                box = box[:, :, pad_top : box_h - pad_bottom, pad_left : box_w - pad_right]
+
+                boxes_in_row.append(box)
+                i += 1
+            boxes_in_row = torch.cat(boxes_in_row, dim=-1)
+            boxes.append(boxes_in_row)
+        merged = torch.cat(boxes, dim=-2)
+
+    return merged
+
+
+def reconstruct_feature_maps(
+    hidden_state: torch.Tensor, batch_size: int, padding: int, output_size: tuple[float, float]
+) -> torch.Tensor:
+    """
+    Reconstructs feature maps from the hidden state produced by any of the encoder. Converts the hidden state of shape
+    `(n_patches_per_batch * batch_size, seq_len, hidden_size)` to feature maps of shape
+    `(batch_size, hidden_size, output_size[0], output_size[1])`.
+
+    Args:
+        hidden_state (torch.Tensor): Input tensor of shape `(n_patches_per_batch * batch_size, seq_len, hidden_size)`
+            representing the encoded patches.
+        batch_size (int): The number of samples in a batch.
+        padding (int): The amount of padding to be removed when merging patches.
+        output_size (tuple[float, float]): The desired output size for the feature maps, specified as `(height, width)`.
+
+    Returns:
+        torch.Tensor: Reconstructed feature maps of shape `(batch_size, hidden_size, output_size[0], output_size[1])`.
+    """
+    # reshape back to image like
+    features = reshape_features(hidden_state)
+
+    # merge all patches in a batch to create one large patch per batch
+    features = merge_patches(
+        features,
+        batch_size=batch_size,
+        padding=padding,
+    )
+
+    # interpolate patches to base size
+    features = F.interpolate(
+        features,
+        size=output_size,
+        mode="bilinear",
+        align_corners=False,
+    )
+
+    return features
+
+
+class DepthProPatchEncoder(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+
+        self.intermediate_hook_ids = config.intermediate_hook_ids
+        self.intermediate_feature_dims = config.intermediate_feature_dims
+        self.scaled_images_ratios = config.scaled_images_ratios
+        self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios
+        self.scaled_images_feature_dims = config.scaled_images_feature_dims
+        self.merge_padding_value = config.merge_padding_value
+
+        self.n_scaled_images = len(config.scaled_images_ratios)
+        self.n_intermediate_hooks = len(config.intermediate_hook_ids)
+        self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
+
+        self.model = AutoModel.from_config(config.patch_model_config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> list[torch.Tensor]:
+        batch_size, num_channels, height, width = pixel_values.shape
+
+        if min(self.scaled_images_ratios) * min(height, width) < self.config.patch_size:
+            raise ValueError(
+                f"Image size {height}x{width} is too small to be scaled "
+                f"with scaled_images_ratios={self.scaled_images_ratios} "
+                f"when patch_size={self.config.patch_size}."
+            )
+
+        # STEP 1: create 3-level image
+
+        scaled_images = []
+        for ratio in self.scaled_images_ratios:
+            scaled_images.append(
+                F.interpolate(
+                    pixel_values,
+                    scale_factor=ratio,
+                    mode="bilinear",
+                    align_corners=False,
+                )
+            )
+
+        # STEP 2: create patches
+
+        for i in range(self.n_scaled_images):
+            scaled_images[i] = split_to_patches(
+                scaled_images[i],
+                patch_size=self.config.patch_size,
+                overlap_ratio=self.scaled_images_overlap_ratios[i],
+            )
+        n_patches_per_scaled_image = [len(i) for i in scaled_images]
+        patches = torch.cat(scaled_images[::-1], dim=0)  # -1 as patch encoder expects high res patches first
+
+        # STEP 3: apply patch encoder
+
+        encodings = self.model(
+            # each patch is processed as a separate batch
+            patches,
+            head_mask=head_mask,
+            # required for intermediate features
+            output_hidden_states=self.n_intermediate_hooks > 0,
+        )
+
+        scaled_images_last_hidden_state = torch.split_with_sizes(encodings[0], n_patches_per_scaled_image[::-1])
+        # -1 (reverse list) as patch encoder returns high res patches first, we need low res first
+        scaled_images_last_hidden_state = scaled_images_last_hidden_state[::-1]
+
+        # calculate base height and width
+        # base height and width are the dimensions of the lowest resolution features
+        exponent_value = torch_int(math.log2(width / self.out_size))
+        base_height = height // 2**exponent_value
+        base_width = width // 2**exponent_value
+
+        # STEP 4: get patch features (high_res, med_res, low_res) - (3-5) in diagram
+
+        scaled_images_features = []
+        for i in range(self.n_scaled_images):
+            hidden_state = scaled_images_last_hidden_state[i]
+            padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[i]))
+            output_height = base_height * 2**i
+            output_width = base_width * 2**i
+            features = reconstruct_feature_maps(
+                hidden_state,
+                batch_size=batch_size,
+                padding=padding,
+                output_size=(output_height, output_width),
+            )
+            scaled_images_features.append(features)
+
+        # STEP 5: get intermediate features - (1-2) in diagram
+
+        intermediate_features = []
+        for i in range(self.n_intermediate_hooks):
+            # +1 to correct index position as hidden_states contain embedding output as well
+            hidden_state = encodings[2][self.intermediate_hook_ids[i] + 1]
+            padding = torch_int(self.merge_padding_value * (1 / self.scaled_images_ratios[-1]))
+            output_height = base_height * 2 ** (self.n_scaled_images - 1)
+            output_width = base_width * 2 ** (self.n_scaled_images - 1)
+            features = reconstruct_feature_maps(
+                hidden_state,
+                batch_size=batch_size,
+                padding=padding,
+                output_size=(output_height, output_width),
+            )
+            intermediate_features.append(features)
+
+        # STEP 7: combine all features
+        features = [*scaled_images_features, *intermediate_features]
+
+        return features
+
+
+class DepthProImageEncoder(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
+
+        self.model = AutoModel.from_config(config.image_model_config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, DepthProOutput]:
+        batch_size, num_channels, height, width = pixel_values.shape
+
+        # scale the image for image_encoder
+        size = self.config.image_model_config.image_size
+        pixel_values = F.interpolate(
+            pixel_values,
+            size=(size, size),
+            mode="bilinear",
+            align_corners=False,
+        )
+        encodings = self.model(
+            pixel_values=pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+        )
+
+        # calculate base height and width
+        # base height and width are the dimensions of the lowest resolution features
+        exponent_value = torch_int(math.log2(width / self.out_size))
+        base_height = height // 2**exponent_value
+        base_width = width // 2**exponent_value
+
+        features = reconstruct_feature_maps(
+            encodings[0],
+            batch_size=batch_size,
+            padding=0,
+            output_size=(base_height, base_width),
+        )
+
+        if not return_dict:
+            return (encodings[0], features) + encodings[2:]  # ignore last_hidden_state and poooler output
+
+        return DepthProOutput(
+            last_hidden_state=encodings.last_hidden_state,
+            features=features,
+            hidden_states=encodings.hidden_states,
+            attentions=encodings.attentions,
+        )
+
+
+class DepthProEncoder(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.intermediate_hook_ids = config.intermediate_hook_ids
+        self.intermediate_feature_dims = config.intermediate_feature_dims
+        self.scaled_images_ratios = config.scaled_images_ratios
+        self.scaled_images_overlap_ratios = config.scaled_images_overlap_ratios
+        self.scaled_images_feature_dims = config.scaled_images_feature_dims
+        self.merge_padding_value = config.merge_padding_value
+
+        self.n_scaled_images = len(self.scaled_images_ratios)
+        self.n_intermediate_hooks = len(self.intermediate_hook_ids)
+
+        self.patch_encoder = DepthProPatchEncoder(config)
+        self.image_encoder = DepthProImageEncoder(config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, DepthProOutput]:
+        batch_size, num_channels, height, width = pixel_values.shape
+
+        patch_features = self.patch_encoder(
+            pixel_values,
+            head_mask=head_mask,
+        )
+        image_encodings = self.image_encoder(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        image_features = image_encodings[1]  # index 1 contains features
+
+        features = [image_features, *patch_features]
+
+        if not return_dict:
+            return (image_encodings[0], features) + image_encodings[2:]
+
+        return DepthProOutput(
+            last_hidden_state=image_encodings.last_hidden_state,
+            features=features,
+            hidden_states=image_encodings.hidden_states,
+            attentions=image_encodings.attentions,
+        )
+
+
+class DepthProFeatureUpsampleBlock(nn.Module):
+    def __init__(
+        self,
+        config: DepthProConfig,
+        input_dims: int,
+        intermediate_dims: int,
+        output_dims: int,
+        n_upsample_layers: int,
+        use_proj: bool = True,
+        bias: bool = False,
+    ):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList()
+
+        # create first projection layer
+        if use_proj:
+            proj = nn.Conv2d(
+                in_channels=input_dims,
+                out_channels=intermediate_dims,
+                kernel_size=1,
+                stride=1,
+                padding=0,
+                bias=bias,
+            )
+            self.layers.append(proj)
+
+        # create following upsample layers
+        for i in range(n_upsample_layers):
+            in_channels = intermediate_dims if i == 0 else output_dims
+            layer = nn.ConvTranspose2d(
+                in_channels=in_channels,
+                out_channels=output_dims,
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=bias,
+            )
+            self.layers.append(layer)
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        for layer in self.layers:
+            features = layer(features)
+        return features
+
+
+class DepthProFeatureUpsample(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.n_scaled_images = len(self.config.scaled_images_ratios)
+        self.n_intermediate_hooks = len(self.config.intermediate_hook_ids)
+
+        # for image_features
+        self.image_block = DepthProFeatureUpsampleBlock(
+            config=config,
+            input_dims=config.image_model_config.hidden_size,
+            intermediate_dims=config.image_model_config.hidden_size,
+            output_dims=config.scaled_images_feature_dims[0],
+            n_upsample_layers=1,
+            use_proj=False,
+            bias=True,
+        )
+
+        # for scaled_images_features
+        self.scaled_images = nn.ModuleList()
+        for i, feature_dims in enumerate(config.scaled_images_feature_dims):
+            block = DepthProFeatureUpsampleBlock(
+                config=config,
+                input_dims=config.patch_model_config.hidden_size,
+                intermediate_dims=feature_dims,
+                output_dims=feature_dims,
+                n_upsample_layers=1,
+            )
+            self.scaled_images.append(block)
+
+        # for intermediate_features
+        self.intermediate = nn.ModuleList()
+        for i, feature_dims in enumerate(config.intermediate_feature_dims):
+            intermediate_dims = config.fusion_hidden_size if i == 0 else feature_dims
+            block = DepthProFeatureUpsampleBlock(
+                config=config,
+                input_dims=config.patch_model_config.hidden_size,
+                intermediate_dims=intermediate_dims,
+                output_dims=feature_dims,
+                n_upsample_layers=2 + i,
+            )
+            self.intermediate.append(block)
+
+    def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
+        features[0] = self.image_block(features[0])
+
+        for i in range(self.n_scaled_images):
+            features[i + 1] = self.scaled_images[i](features[i + 1])
+
+        for i in range(self.n_intermediate_hooks):
+            features[self.n_scaled_images + i + 1] = self.intermediate[i](features[self.n_scaled_images + i + 1])
+
+        return features
+
+
+class DepthProFeatureProjection(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+
+        combined_feature_dims = config.scaled_images_feature_dims + config.intermediate_feature_dims
+        self.projections = nn.ModuleList()
+        for i, in_channels in enumerate(combined_feature_dims):
+            if i == len(combined_feature_dims) - 1 and in_channels == config.fusion_hidden_size:
+                # projection for last layer can be ignored if input and output channels already match
+                self.projections.append(nn.Identity())
+            else:
+                self.projections.append(
+                    nn.Conv2d(
+                        in_channels=in_channels,
+                        out_channels=config.fusion_hidden_size,
+                        kernel_size=3,
+                        stride=1,
+                        padding=1,
+                        bias=False,
+                    )
+                )
+
+    def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
+        projected_features = []
+        for i, projection in enumerate(self.projections):
+            upsampled_feature = projection(features[i])
+            projected_features.append(upsampled_feature)
+        return projected_features
+
+
+class DepthProNeck(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+
+        self.feature_upsample = DepthProFeatureUpsample(config)
+        self.fuse_image_with_low_res = nn.Conv2d(
+            in_channels=config.scaled_images_feature_dims[0] * 2,
+            out_channels=config.scaled_images_feature_dims[0],
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=True,
+        )
+        self.feature_projection = DepthProFeatureProjection(config)
+
+    def forward(self, features: list[torch.Tensor]) -> list[torch.Tensor]:
+        features = self.feature_upsample(features)
+        # global features = low res features + image features
+        global_features = torch.cat((features[1], features[0]), dim=1)
+        global_features = self.fuse_image_with_low_res(global_features)
+        features = [global_features, *features[2:]]
+        features = self.feature_projection(features)
+        return features
+
+
+# General docstring
+
+
+@auto_docstring
+class DepthProPreTrainedModel(PreTrainedModel):
+    config: DepthProConfig
+    base_model_prefix = "depth_pro"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _supports_sdpa = True
+    _no_split_modules = ["DepthProPreActResidualLayer"]
+    _keys_to_ignore_on_load_unexpected = ["fov_model.*"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, (nn.Conv2d, nn.ConvTranspose2d)):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+@auto_docstring
+class DepthProModel(DepthProPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+        self.encoder = DepthProEncoder(config)
+        self.neck = DepthProNeck(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.encoder.image_encoder.model.get_input_embeddings()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DepthProOutput]:
+        r"""
+        Examples:
+
+        ```python
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, DepthProModel
+
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> checkpoint = "apple/DepthPro-hf"
+        >>> processor = AutoProcessor.from_pretrained(checkpoint)
+        >>> model = DepthProModel.from_pretrained(checkpoint)
+
+        >>> # prepare image for the model
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     output = model(**inputs)
+
+        >>> output.last_hidden_state.shape
+        torch.Size([1, 35, 577, 1024])
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encodings = self.encoder(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        features = encodings[1]  # index 1 contains features
+        features = self.neck(features)
+
+        if not return_dict:
+            return (encodings[0], features) + encodings[2:]
+
+        return DepthProOutput(
+            last_hidden_state=encodings.last_hidden_state,
+            features=features,
+            hidden_states=encodings.hidden_states,
+            attentions=encodings.attentions,
+        )
+
+
+# Copied from transformers.models.dpt.modeling_dpt.DPTPreActResidualLayer DPT->DepthPro
+class DepthProPreActResidualLayer(nn.Module):
+    """
+    ResidualConvUnit, pre-activate residual unit.
+
+    Args:
+        config (`[DepthProConfig]`):
+            Model configuration class defining the model architecture.
+    """
+
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+
+        self.use_batch_norm = config.use_batch_norm_in_fusion_residual
+        use_bias_in_fusion_residual = (
+            config.use_bias_in_fusion_residual
+            if config.use_bias_in_fusion_residual is not None
+            else not self.use_batch_norm
+        )
+
+        self.activation1 = nn.ReLU()
+        self.convolution1 = nn.Conv2d(
+            config.fusion_hidden_size,
+            config.fusion_hidden_size,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias_in_fusion_residual,
+        )
+
+        self.activation2 = nn.ReLU()
+        self.convolution2 = nn.Conv2d(
+            config.fusion_hidden_size,
+            config.fusion_hidden_size,
+            kernel_size=3,
+            stride=1,
+            padding=1,
+            bias=use_bias_in_fusion_residual,
+        )
+
+        if self.use_batch_norm:
+            self.batch_norm1 = nn.BatchNorm2d(config.fusion_hidden_size)
+            self.batch_norm2 = nn.BatchNorm2d(config.fusion_hidden_size)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        residual = hidden_state
+        hidden_state = self.activation1(hidden_state)
+
+        hidden_state = self.convolution1(hidden_state)
+
+        if self.use_batch_norm:
+            hidden_state = self.batch_norm1(hidden_state)
+
+        hidden_state = self.activation2(hidden_state)
+        hidden_state = self.convolution2(hidden_state)
+
+        if self.use_batch_norm:
+            hidden_state = self.batch_norm2(hidden_state)
+
+        return hidden_state + residual
+
+
+# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionLayer
+# except it uses deconv and skip_add and needs no interpolation
+class DepthProFeatureFusionLayer(nn.Module):
+    def __init__(self, config: DepthProConfig, use_deconv: bool = True):
+        super().__init__()
+        self.config = config
+        self.use_deconv = use_deconv
+
+        self.residual_layer1 = DepthProPreActResidualLayer(config)
+        self.residual_layer2 = DepthProPreActResidualLayer(config)
+
+        if self.use_deconv:
+            self.deconv = nn.ConvTranspose2d(
+                in_channels=config.fusion_hidden_size,
+                out_channels=config.fusion_hidden_size,
+                kernel_size=2,
+                stride=2,
+                padding=0,
+                bias=False,
+            )
+
+        self.projection = nn.Conv2d(config.fusion_hidden_size, config.fusion_hidden_size, kernel_size=1, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor, residual: Optional[torch.Tensor] = None) -> torch.Tensor:
+        if residual is not None:
+            residual = self.residual_layer1(residual)
+            hidden_state = hidden_state + residual
+
+        hidden_state = self.residual_layer2(hidden_state)
+        if self.use_deconv:
+            hidden_state = self.deconv(hidden_state)
+        hidden_state = self.projection(hidden_state)
+
+        return hidden_state
+
+
+# Modified from transformers.models.dpt.modeling_dpt.DPTFeatureFusionStage with DPT->DepthPro
+# with deconv and reversed layers
+class DepthProFeatureFusionStage(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.num_layers = len(config.intermediate_hook_ids) + len(config.scaled_images_ratios)
+        self.intermediate = nn.ModuleList()
+        for _ in range(self.num_layers - 1):
+            self.intermediate.append(DepthProFeatureFusionLayer(config))
+
+        # final layer does not require deconvolution
+        self.final = DepthProFeatureFusionLayer(config, use_deconv=False)
+
+    def forward(self, hidden_states: list[torch.Tensor]) -> list[torch.Tensor]:
+        if self.num_layers != len(hidden_states):
+            raise ValueError(
+                f"num_layers={self.num_layers} in DepthProFeatureFusionStage"
+                f"does not match len(hidden_states)={len(hidden_states)}"
+            )
+
+        fused_hidden_states = []
+        fused_hidden_state = None
+        for hidden_state, layer in zip(hidden_states[:-1], self.intermediate):
+            if fused_hidden_state is None:
+                # first layer only uses the last hidden_state
+                fused_hidden_state = layer(hidden_state)
+            else:
+                fused_hidden_state = layer(fused_hidden_state, hidden_state)
+            fused_hidden_states.append(fused_hidden_state)
+
+        hidden_state = hidden_states[-1]
+        fused_hidden_state = self.final(fused_hidden_state, hidden_state)
+        fused_hidden_states.append(fused_hidden_state)
+
+        return fused_hidden_states
+
+
+class DepthProFovEncoder(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
+
+        self.model = AutoModel.from_config(config.fov_model_config)
+        self.neck = nn.Linear(config.fov_model_config.hidden_size, config.fusion_hidden_size // 2)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        batch_size, num_channels, height, width = pixel_values.shape
+
+        # scale the image for fov_encoder
+        size = self.config.fov_model_config.image_size
+        pixel_values = F.interpolate(
+            pixel_values,
+            size=(size, size),
+            mode="bilinear",
+            align_corners=False,
+        )
+        encodings = self.model(
+            pixel_values=pixel_values,
+            head_mask=head_mask,
+        )
+        hidden_state = encodings[0]
+        hidden_state = self.neck(hidden_state)
+
+        # calculate base height and width
+        # base height and width are the dimensions of the lowest resolution features
+        exponent_value = torch_int(math.log2(width / self.out_size))
+        base_height = height // 2**exponent_value
+        base_width = width // 2**exponent_value
+
+        features = reconstruct_feature_maps(
+            hidden_state,
+            batch_size=batch_size,
+            padding=0,
+            output_size=(base_height, base_width),
+        )
+
+        return features
+
+
+class DepthProFovHead(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.fusion_hidden_size = config.fusion_hidden_size
+        self.out_size = config.image_model_config.image_size // config.image_model_config.patch_size
+
+        # create initial head layers
+        self.layers = nn.ModuleList()
+        for i in range(config.num_fov_head_layers):
+            self.layers.append(
+                nn.Conv2d(
+                    math.ceil(self.fusion_hidden_size / 2 ** (i + 1)),
+                    math.ceil(self.fusion_hidden_size / 2 ** (i + 2)),
+                    kernel_size=3,
+                    stride=2,
+                    padding=1,
+                )
+            )
+            self.layers.append(nn.ReLU(True))
+        # calculate expected shapes to finally generate a scalar output from final head layer
+        final_in_channels = math.ceil(self.fusion_hidden_size / 2 ** (config.num_fov_head_layers + 1))
+        final_kernel_size = torch_int((self.out_size - 1) / 2**config.num_fov_head_layers + 1)
+        self.layers.append(
+            nn.Conv2d(
+                in_channels=final_in_channels, out_channels=1, kernel_size=final_kernel_size, stride=1, padding=0
+            )
+        )
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        features = F.interpolate(
+            features,
+            size=(self.out_size, self.out_size),
+            mode="bilinear",
+            align_corners=False,
+        )
+        for layer in self.layers:
+            features = layer(features)
+        return features
+
+
+class DepthProFovModel(nn.Module):
+    def __init__(self, config: DepthProConfig):
+        super().__init__()
+        self.config = config
+        self.fusion_hidden_size = config.fusion_hidden_size
+
+        self.fov_encoder = DepthProFovEncoder(config)
+        self.conv = nn.Conv2d(
+            self.fusion_hidden_size, self.fusion_hidden_size // 2, kernel_size=3, stride=2, padding=1
+        )
+        self.activation = nn.ReLU(inplace=True)
+        self.head = DepthProFovHead(config)
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        global_features: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        fov_features = self.fov_encoder(pixel_values, head_mask)
+
+        global_features = self.conv(global_features)
+        global_features = self.activation(global_features)
+
+        fov_features = fov_features + global_features
+        fov_output = self.head(fov_features)
+        fov_output = fov_output.flatten()
+
+        return fov_output
+
+
+class DepthProDepthEstimationHead(nn.Module):
+    """
+    The DepthProDepthEstimationHead module serves as the output head for depth estimation tasks.
+    This module comprises a sequence of convolutional and transposed convolutional layers
+    that process the feature map from the fusion to produce a single-channel depth map.
+    Key operations include dimensionality reduction and upsampling to match the input resolution.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        features = config.fusion_hidden_size
+        self.layers = nn.ModuleList(
+            [
+                nn.Conv2d(features, features // 2, kernel_size=3, stride=1, padding=1),
+                nn.ConvTranspose2d(
+                    in_channels=features // 2,
+                    out_channels=features // 2,
+                    kernel_size=2,
+                    stride=2,
+                    padding=0,
+                    bias=True,
+                ),
+                nn.Conv2d(features // 2, 32, kernel_size=3, stride=1, padding=1),
+                nn.ReLU(True),
+                nn.Conv2d(32, 1, kernel_size=1, stride=1, padding=0),
+                nn.ReLU(),
+            ]
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for layer in self.layers:
+            hidden_states = layer(hidden_states)
+
+        predicted_depth = hidden_states.squeeze(dim=1)
+        return predicted_depth
+
+
+@auto_docstring(
+    custom_intro="""
+    DepthPro Model with a depth estimation head on top (consisting of 3 convolutional layers).
+    """
+)
+class DepthProForDepthEstimation(DepthProPreTrainedModel):
+    def __init__(self, config, use_fov_model=None):
+        r"""
+        use_fov_model (bool, *optional*):
+            Whether to use the field of view model.
+        """
+        super().__init__(config)
+        self.config = config
+        self.use_fov_model = use_fov_model if use_fov_model is not None else self.config.use_fov_model
+
+        # dinov2 (vit) like encoders
+        self.depth_pro = DepthProModel(config)
+
+        # dpt (vit) like fusion stage
+        self.fusion_stage = DepthProFeatureFusionStage(config)
+
+        # depth estimation head
+        self.head = DepthProDepthEstimationHead(config)
+
+        # dinov2 (vit) like encoder
+        self.fov_model = DepthProFovModel(config) if self.use_fov_model else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], DepthProDepthEstimatorOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth depth estimation maps for computing the loss.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, DepthProForDepthEstimation
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> checkpoint = "apple/DepthPro-hf"
+        >>> processor = AutoImageProcessor.from_pretrained(checkpoint)
+        >>> model = DepthProForDepthEstimation.from_pretrained(checkpoint)
+
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>> model.to(device)
+
+        >>> # prepare image for the model
+        >>> inputs = processor(images=image, return_tensors="pt").to(device)
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> # interpolate to original size
+        >>> post_processed_output = processor.post_process_depth_estimation(
+        ...     outputs, target_sizes=[(image.height, image.width)],
+        ... )
+
+        >>> # get the field of view (fov) predictions
+        >>> field_of_view = post_processed_output[0]["field_of_view"]
+        >>> focal_length = post_processed_output[0]["focal_length"]
+
+        >>> # visualize the prediction
+        >>> predicted_depth = post_processed_output[0]["predicted_depth"]
+        >>> depth = predicted_depth * 255 / predicted_depth.max()
+        >>> depth = depth.detach().cpu().numpy()
+        >>> depth = Image.fromarray(depth.astype("uint8"))
+        ```"""
+        loss = None
+        if labels is not None:
+            raise NotImplementedError("Training is not implemented yet")
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        depth_pro_outputs = self.depth_pro(
+            pixel_values=pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+        )
+        features = depth_pro_outputs.features
+        fused_hidden_states = self.fusion_stage(features)
+        predicted_depth = self.head(fused_hidden_states[-1])
+
+        if self.use_fov_model:
+            # frozen features from encoder are used
+            features_for_fov = features[0].detach()
+            fov = self.fov_model(
+                pixel_values=pixel_values,
+                global_features=features_for_fov,
+                head_mask=head_mask,
+            )
+        else:
+            fov = None
+
+        if not return_dict:
+            outputs = [loss, predicted_depth, fov, depth_pro_outputs.hidden_states, depth_pro_outputs.attentions]
+            return tuple(v for v in outputs if v is not None)
+
+        return DepthProDepthEstimatorOutput(
+            loss=loss,
+            predicted_depth=predicted_depth,
+            field_of_view=fov,
+            hidden_states=depth_pro_outputs.hidden_states,
+            attentions=depth_pro_outputs.attentions,
+        )
+
+
+__all__ = ["DepthProPreTrainedModel", "DepthProModel", "DepthProForDepthEstimation"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c1abeb3b4be1f35d8ebc613c9d19ec8e3f75f24
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a0c97458ae20ac023eb69977d3e9bdcdc1ffabbe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/configuration_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5066fb0d8d67fe8964d1fb860075e4c62083a605
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/feature_extraction_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a017d6b6cf53f71381b44f9dd654b4cbf0432c8a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63825ea821e2eab4aa56993a9ef83ce5d589f7a0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/image_processing_detr_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bab463964990fd41c09ed5f65311e093a071c19
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/detr/__pycache__/modeling_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d738fbc087888597da19735271366d4e35ab708c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_dia import *
+    from .feature_extraction_dia import *
+    from .generation_dia import *
+    from .modeling_dia import *
+    from .processing_dia import *
+    from .tokenization_dia import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/configuration_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/configuration_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..d4dec60b3e4853574e4d528e7b641507a8c0b414
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/configuration_dia.py
@@ -0,0 +1,376 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dia model configuration"""
+
+from typing import Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaEncoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiaEncoder`]. It is used to instantiate a Dia
+    encoder according to the specified arguments, defining the encoder architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of key and value heads for each attention layer in the Transformer encoder.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimensionality of the attention head.
+        intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        vocab_size (`int`, *optional*, defaults to 256):
+            Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DiaModel`].
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"swish"` and `"gelu_new"` are supported.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+    """
+
+    model_type = "dia_encoder"
+
+    def __init__(
+        self,
+        max_position_embeddings: int = 1024,
+        num_hidden_layers: int = 12,
+        hidden_size: int = 1024,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 16,
+        head_dim: int = 128,
+        intermediate_size: int = 4096,
+        norm_eps: float = 1e-5,
+        vocab_size: int = 256,
+        hidden_act: str = "silu",
+        rope_theta: float = 10000.0,
+        rope_scaling: Optional[dict] = None,
+        initializer_range: float = 0.02,
+        **kwargs,
+    ):
+        self.max_position_embeddings = max_position_embeddings
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.head_dim = head_dim
+        self.norm_eps = norm_eps
+        self.vocab_size = vocab_size
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        self.initializer_range = initializer_range
+        super().__init__(**kwargs)
+
+
+class DiaDecoderConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiaDecoder`]. It is used to instantiate a Dia
+    decoder according to the specified arguments, defining the decoder architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        max_position_embeddings (`int`, *optional*, defaults to 3072):
+            The maximum sequence length that this model might ever be used with.
+        num_hidden_layers (`int`, *optional*, defaults to 18):
+            Number of hidden layers in the Transformer decoder.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimensionality of the decoder layers and the pooler layer.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 4):
+            Number of key and value heads for each attention layer in the Transformer decoder.
+        head_dim (`int`, *optional*, defaults to 128):
+            Dimensionality of the attention head.
+        cross_num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each cross-attention layer in the Transformer decoder.
+        cross_head_dim (`int`, *optional*, defaults to 128):
+            Dimensionality of the cross-attention head.
+        cross_num_key_value_heads (`int`, *optional*, defaults to 16):
+            Number of key and value heads for each cross-attention layer in the Transformer decoder.
+        cross_hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the cross-attention layers.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        vocab_size (`int`, *optional*, defaults to 1028):
+            Vocabulary size of the Dia model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DiaModel`].
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder. If string, `"gelu"`, `"relu"`,
+            `"swish"` and `"gelu_new"` are supported.
+        num_channels (`int`, *optional*, defaults to 9):
+            Number of channels for the Dia decoder.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`List[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Indicating that this model is part of an encoder-decoder architecture.
+    """
+
+    model_type = "dia_decoder"
+
+    def __init__(
+        self,
+        max_position_embeddings: int = 3072,
+        num_hidden_layers: int = 18,
+        hidden_size: int = 2048,
+        intermediate_size: int = 8192,
+        num_attention_heads: int = 16,
+        num_key_value_heads: int = 4,
+        head_dim: int = 128,
+        cross_num_attention_heads: int = 16,
+        cross_head_dim: int = 128,
+        cross_num_key_value_heads: int = 16,
+        cross_hidden_size: int = 1024,
+        norm_eps: float = 1e-5,
+        vocab_size: int = 1028,
+        hidden_act: str = "silu",
+        num_channels: int = 9,
+        rope_theta: float = 10000.0,
+        rope_scaling: Optional[dict] = None,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        is_encoder_decoder: bool = True,
+        **kwargs,
+    ):
+        self.max_position_embeddings = max_position_embeddings
+        self.num_hidden_layers = num_hidden_layers
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_attention_heads = num_attention_heads
+        self.num_key_value_heads = num_key_value_heads
+        self.head_dim = head_dim
+        self.cross_num_key_value_heads = cross_num_key_value_heads
+        self.cross_num_attention_heads = cross_num_attention_heads
+        self.cross_head_dim = cross_head_dim
+        self.cross_hidden_size = cross_hidden_size
+        self.norm_eps = norm_eps
+        self.vocab_size = vocab_size
+        self.hidden_act = hidden_act
+        self.num_channels = num_channels
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+
+class DiaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiaModel`]. It is used to instantiate a
+    Dia model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the
+    [nari-labs/Dia-1.6B](https://huggingface.co/nari-labs/Dia-1.6B) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        encoder_config (`DiaEncoderConfig`, *optional*):
+            Configuration for the encoder part of the model. If not provided, a default `DiaEncoderConfig` will be used.
+        decoder_config (`DiaDecoderConfig`, *optional*):
+            Configuration for the decoder part of the model. If not provided, a default `DiaDecoderConfig` will be used.
+        norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the normalization layers.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Indicating that this model uses an encoder-decoder architecture.
+        pad_token_id (`int`, *optional*, defaults to 1025):
+            Padding token id.
+        eos_token_id (`int`, *optional*, defaults to 1024):
+            End of stream token id.
+        bos_token_id (`int`, *optional*, defaults to 1026):
+            Beginning of stream token id.
+        delay_pattern (`list[int]`, *optional*, defaults to `[0, 8, 9, 10, 11, 12, 13, 14, 15]`):
+            The delay pattern for the decoder. The length of this list must match `decoder_config.num_channels`.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+
+    Example:
+
+    ```python
+    >>> from transformers import DiaConfig, DiaModel
+
+    >>> # Initializing a DiaConfig with default values
+    >>> configuration = DiaConfig()
+
+    >>> # Initializing a DiaModel (with random weights) from the configuration
+    >>> model = DiaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "dia"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    sub_configs = {"encoder_config": DiaEncoderConfig, "decoder_config": DiaDecoderConfig}
+
+    def __init__(
+        self,
+        encoder_config: Optional[DiaEncoderConfig] = None,
+        decoder_config: Optional[DiaDecoderConfig] = None,
+        norm_eps: float = 1e-5,
+        is_encoder_decoder: bool = True,
+        pad_token_id: int = 1025,
+        eos_token_id: int = 1024,
+        bos_token_id: int = 1026,
+        delay_pattern: Optional[list[int]] = None,
+        initializer_range: float = 0.02,
+        use_cache: bool = True,
+        **kwargs,
+    ):
+        if isinstance(encoder_config, dict):
+            encoder_config = DiaEncoderConfig(**encoder_config)
+        if isinstance(decoder_config, dict):
+            decoder_config = DiaDecoderConfig(**decoder_config)
+        self.encoder_config = encoder_config if encoder_config is not None else DiaEncoderConfig()
+        self.decoder_config = decoder_config if decoder_config is not None else DiaDecoderConfig()
+        self.norm_eps = norm_eps
+        self.delay_pattern = delay_pattern if delay_pattern is not None else [0, 8, 9, 10, 11, 12, 13, 14, 15]
+        self.initializer_range = initializer_range
+        self.use_cache = use_cache
+
+        assert self.decoder_config.num_channels == len(self.delay_pattern), (
+            "Number of channels must match delay pattern length."
+        )
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            bos_token_id=bos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            **kwargs,
+        )
+
+    def get_text_config(self, *args, **kwargs):
+        """Defaulting to audio config as it's the decoder in this case which is usually the text backbone"""
+        return self.decoder_config
+
+
+__all__ = ["DiaConfig", "DiaEncoderConfig", "DiaDecoderConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/feature_extraction_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/feature_extraction_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..b4376b773b27365932774a35746b2928cf0af707
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/feature_extraction_dia.py
@@ -0,0 +1,183 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for Dia"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs an Dia feature extractor.
+
+    This feature extractor inherits from [`~feature_extraction_sequence_utils.SequenceFeatureExtractor`] which contains
+    most of the main methods. Users should refer to this superclass for more information regarding those methods.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 1):
+            The feature dimension of the extracted features. Use 1 for mono, 2 for stereo.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the audio waveform should be digitalized, expressed in hertz (Hz).
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used for padding.
+        hop_length (`int`, *optional*, defaults to 512):
+            Overlap length between successive windows.
+    """
+
+    model_input_names = ["input_values", "n_quantizers"]
+
+    def __init__(
+        self,
+        feature_size: int = 1,
+        sampling_rate: int = 16000,
+        padding_value: float = 0.0,
+        hop_length: int = 512,
+        **kwargs,
+    ):
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+        self.hop_length = hop_length
+
+    def __call__(
+        self,
+        raw_audio: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Optional[Union[bool, str, PaddingStrategy]] = None,
+        truncation: Optional[bool] = False,
+        max_length: Optional[int] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_audio (`np.ndarray`, `list[float]`, `list[np.ndarray]`, `list[list[float]]`):
+                The sequence or batch of sequences to be processed. Each sequence can be a numpy array, a list of float
+                values, a list of numpy arrays or a list of list of float values. The numpy array must be of shape
+                `(num_samples,)` for mono audio (`feature_size = 1`), or `(2, num_samples)` for stereo audio
+                (`feature_size = 2`).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, *optional*, defaults to `False`):
+                Activates truncation to cut input sequences longer than `max_length` to `max_length`.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*, default to 'pt'):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `audio` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided audio input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        if padding and truncation:
+            raise ValueError("Both padding and truncation were set. Make sure you only set one.")
+        elif padding is None:
+            # by default let's pad the inputs
+            padding = True
+
+        is_batched = bool(
+            isinstance(raw_audio, (list, tuple)) and (isinstance(raw_audio[0], (np.ndarray, tuple, list)))
+        )
+
+        if is_batched:
+            raw_audio = [np.asarray(audio, dtype=np.float32).T for audio in raw_audio]
+        elif not is_batched and not isinstance(raw_audio, np.ndarray):
+            raw_audio = np.asarray(raw_audio, dtype=np.float32)
+        elif isinstance(raw_audio, np.ndarray) and raw_audio.dtype is np.dtype(np.float64):
+            raw_audio = raw_audio.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_audio = [np.asarray(raw_audio).T]
+
+        # convert stereo to mono if necessary, unique to Dia
+        for idx, example in enumerate(raw_audio):
+            if self.feature_size == 2 and example.ndim == 2:
+                raw_audio[idx] = np.mean(example, -1)
+
+        # verify inputs are valid
+        for idx, example in enumerate(raw_audio):
+            if example.ndim > 2:
+                raise ValueError(f"Expected input shape (channels, length) but got shape {example.shape}")
+            if self.feature_size == 1 and example.ndim != 1:
+                raise ValueError(f"Expected mono audio but example has {example.shape[-1]} channels")
+            if self.feature_size == 2 and example.ndim != 1:  # note the conversion before
+                raise ValueError(f"Expected stereo audio but example has {example.shape[-1]} channels")
+
+        input_values = BatchFeature({"input_values": raw_audio})
+
+        # temporarily treat it as if we were mono as we also convert stereo to mono
+        original_feature_size = self.feature_size
+        self.feature_size = 1
+
+        # normal padding on batch
+        padded_inputs = self.pad(
+            input_values,
+            max_length=max_length,
+            truncation=truncation,
+            padding=padding,
+            return_attention_mask=True,
+            pad_to_multiple_of=self.hop_length,
+        )
+        padded_inputs["padding_mask"] = padded_inputs.pop("attention_mask")
+
+        input_values = []
+        for example in padded_inputs.pop("input_values"):
+            if self.feature_size == 1:
+                example = example[..., None]
+            input_values.append(example.T)
+
+        padded_inputs["input_values"] = input_values
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        # rewrite back to original feature size
+        self.feature_size = original_feature_size
+
+        return padded_inputs
+
+
+__all__ = ["DiaFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/generation_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/generation_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..c297de7203d4e5b30a189047233976d310179907
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/generation_dia.py
@@ -0,0 +1,463 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Any, Callable, Optional, Union
+
+import torch
+import torch.distributed as dist
+
+from ...generation.logits_process import (
+    DiaClassifierFreeGuidanceLogitsProcessor,
+    DiaEOSChannelFilterLogitsProcessor,
+    DiaEOSDelayPatternLogitsProcessor,
+    LogitsProcessorList,
+    TemperatureLogitsWarper,
+)
+from ...generation.stopping_criteria import StoppingCriteriaList
+from ...generation.streamers import BaseStreamer
+from ...generation.utils import GenerateOutput, GenerationConfig, GenerationMixin, GenerationMode
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaGenerationMixin(GenerationMixin):
+    # Indicates CFG which needs preparation to be properly handled by repeats
+    _uses_cfg = None
+
+    def _get_logits_processor(
+        self,
+        generation_config: GenerationConfig,
+        input_ids_seq_length: Optional[int] = None,
+        encoder_input_ids: Optional[torch.LongTensor] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        device: Optional[str] = None,
+        model_kwargs: Optional[dict[str, Any]] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+    ) -> LogitsProcessorList:
+        # Need either custom order or custom processor instead
+        # (Temporarily disabling those for the super function)
+        original_guidance_scale = generation_config.guidance_scale
+        original_temperature = generation_config.temperature
+        generation_config.guidance_scale = None
+        generation_config.temperature = None
+
+        # Get base processors and those we can integrate easily
+        custom_processors = LogitsProcessorList()
+
+        if original_temperature is not None and original_temperature != 1.0:
+            custom_processors.append(TemperatureLogitsWarper(original_temperature))
+
+        custom_processors.append(
+            DiaEOSChannelFilterLogitsProcessor(
+                num_channels=len(self.config.delay_pattern),
+                eos_token_id=self.config.eos_token_id,
+            )
+        )
+
+        merged_processors = super()._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_seq_length,
+            encoder_input_ids=encoder_input_ids,
+            prefix_allowed_tokens_fn=None,
+            logits_processor=custom_processors,
+            device=device,
+            model_kwargs=model_kwargs,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+        )
+
+        # Custom processors we need at specific positions
+        if original_guidance_scale is not None and original_guidance_scale != 1:
+            cfg_processor = DiaClassifierFreeGuidanceLogitsProcessor(
+                guidance_scale=original_guidance_scale,
+                guidance_top_k=generation_config.top_k,
+            )
+            merged_processors.insert(0, cfg_processor)
+
+        merged_processors.append(
+            DiaEOSDelayPatternLogitsProcessor(
+                delay_pattern=self.config.delay_pattern,
+                eos_token_id=self.config.eos_token_id,
+                max_generation_len=generation_config.max_length,
+                device=device,
+            )
+        )
+
+        # Enable temporarily disabled values back
+        generation_config.guidance_scale = original_guidance_scale
+        generation_config.temperature = original_temperature
+
+        return merged_processors
+
+    def _prepare_generation_config(
+        self, generation_config: Optional[GenerationConfig], use_model_defaults: Optional[bool] = None, **kwargs: Any
+    ) -> tuple[GenerationConfig, dict]:
+        generation_config, model_kwargs = super()._prepare_generation_config(
+            generation_config, use_model_defaults, **kwargs
+        )
+
+        # We allow generation up to max length + max delay pattern
+        # (will revert back to max length after generation)
+        generation_config.max_length += max(self.config.delay_pattern)
+
+        # Internal flag to indicate CFG that needs to prepare unconditioned input
+        self._uses_cfg = generation_config.guidance_scale is not None and generation_config.guidance_scale != 1
+
+        return generation_config, model_kwargs
+
+    def _prepare_model_inputs(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        bos_token_id: Optional[torch.Tensor] = None,
+        model_kwargs: Optional[dict[str, torch.Tensor]] = None,
+    ) -> tuple[torch.Tensor, Optional[str], dict[str, torch.Tensor]]:
+        inputs, input_name, model_kwargs = super()._prepare_model_inputs(
+            inputs=inputs,
+            bos_token_id=bos_token_id,
+            model_kwargs=model_kwargs,
+        )
+
+        # If CFG is requested we fill in the unconditioned parts
+        if self._uses_cfg:
+            unconditioned_inputs = torch.zeros_like(inputs)
+            inputs = torch.cat([inputs, unconditioned_inputs], dim=0)
+
+            if model_kwargs.get("attention_mask", None) is not None:
+                model_kwargs["attention_mask"] = model_kwargs["attention_mask"].repeat(2, 1)
+
+        return inputs, input_name, model_kwargs
+
+    def _prepare_decoder_input_ids_for_generation(
+        self,
+        batch_size: int,
+        model_input_name: str,
+        model_kwargs: dict[str, torch.Tensor],
+        decoder_start_token_id: torch.Tensor,
+        device: Optional[torch.device] = None,
+    ) -> tuple[torch.LongTensor, dict[str, torch.Tensor]]:
+        """Prepares `decoder_input_ids` for generation with encoder-decoder models"""
+        # 1. Check whether the user has defined `decoder_input_ids` and `decoder_attention_mask`; if not error out
+        decoder_input_ids = decoder_attention_mask = None
+        if model_kwargs is not None and "decoder_input_ids" in model_kwargs:
+            decoder_input_ids = model_kwargs.pop("decoder_input_ids")
+        if model_kwargs is not None and "decoder_attention_mask" in model_kwargs:
+            decoder_attention_mask = model_kwargs.pop("decoder_attention_mask")
+
+        # We allow generating without preparation (no proper delay) but discourage it
+        if decoder_input_ids is None or decoder_attention_mask is None:
+            logger.warning_once(
+                "In order to generate with Dia, we need the processed audio input: Got `decoder_input_ids`:"
+                f" {decoder_input_ids is not None} and got `decoder_attention_mask`={decoder_attention_mask is not None}."
+                f" This can be achieved via the [`DiaProcessor`] but now defaulting to non-delayed generation."
+            )
+
+            num_channels = self.config.decoder_config.num_channels
+            real_batch_size = batch_size // 2 if self._uses_cfg else batch_size
+
+            if decoder_input_ids is None:
+                decoder_input_ids = torch.full(
+                    (real_batch_size, 1, num_channels), decoder_start_token_id, dtype=torch.long, device=device
+                )
+
+            decoder_attention_mask = torch.ones(
+                size=(real_batch_size, decoder_input_ids.shape[1]), dtype=torch.long, device=device
+            )
+
+        # 2. Determine the valid input and what works as mask within the input
+        delay_mask = decoder_input_ids.long()
+        valid_input_size = (
+            decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == self.config.pad_token_id).sum(dim=-1).max()
+        )
+        decoder_input_ids = delay_mask[:, :valid_input_size].transpose(1, 2).long()
+        decoder_attention_mask = decoder_attention_mask[:, :valid_input_size].long()
+
+        # 3. Overwrite into model kwargs
+        model_kwargs["decoder_attention_mask"] = decoder_attention_mask
+        model_kwargs["decoder_delay_mask"] = delay_mask
+
+        return decoder_input_ids, model_kwargs
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        encoder_outputs=None,  # Using this to easily get the batch size
+        decoder_delay_mask=None,
+        **kwargs,
+    ):
+        # Reshape decoder input_ids to 3D to be compile friendly and to fit the expected model input shape
+        batch_size = encoder_outputs[0].shape[0] // 2 if self._uses_cfg else encoder_outputs[0].shape[0]
+        input_ids = input_ids.reshape(batch_size, self.config.decoder_config.num_channels, -1).transpose(1, 2)
+
+        # Base method handles most things except CFG and the delay pattern mask
+        model_inputs = super().prepare_inputs_for_generation(input_ids, encoder_outputs=encoder_outputs, **kwargs)
+
+        # Post processing for CFG and overwriting via delay pattern mask
+        # 1. Delay pattern mask -- force tokens if not allowed to predict (!= pad_token in mask)
+        model_inputs["decoder_input_ids"] = self.apply_delay_mask(
+            input_ids, self.config.pad_token_id, decoder_delay_mask
+        )
+
+        # Depending on cache usage we need to pass all or just one
+        if model_inputs.get("use_cache", False) and model_inputs["cache_position"][0] > 0:
+            model_inputs["decoder_input_ids"] = model_inputs["decoder_input_ids"][:, -1, :][:, None, :]
+
+        # Be compile friendly
+        model_inputs["decoder_input_ids"] = model_inputs["decoder_input_ids"].contiguous()
+
+        # 2. Apply CFG duplication if needed
+        if self._uses_cfg:
+            for key in ["decoder_input_ids", "decoder_attention_mask", "decoder_position_ids"]:
+                if model_inputs.get(key, None) is not None:
+                    # double first dimension and keep everything else the same
+                    repeat_pattern = tuple([2] + [1] * (model_inputs[key].ndim - 1))
+                    model_inputs[key] = model_inputs[key].repeat(*repeat_pattern)
+
+        return model_inputs
+
+    @staticmethod
+    def apply_delay_mask(input_ids: torch.Tensor, pad_id: int, delay_mask: Optional[torch.Tensor]) -> torch.Tensor:
+        if delay_mask is None:
+            return input_ids
+
+        mask_len = min(input_ids.shape[1], delay_mask.shape[1])
+        valid_mask = delay_mask[:, :mask_len, :]
+        valid_input = input_ids[:, :mask_len, :]
+
+        # Overwrite the respective parts of the input
+        input_ids[:, :mask_len, :] = torch.where(valid_mask == pad_id, valid_input, valid_mask)
+
+        return input_ids
+
+    def _main_generate_loop(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
+        synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        use_model_defaults: Optional[bool] = None,
+        custom_generate: Optional[str] = None,
+        **kwargs,
+    ):
+        # ********** mostly taken from main generate function up to calling the different methods (see NOTE) **********
+        # 1. Handle `generation_config` and kwargs that might update it, and validate the `.generate()` call
+        generation_mode_kwargs = self._extract_generation_mode_kwargs(
+            custom_generate,
+            kwargs,
+            synced_gpus,
+            assistant_model,
+            streamer,
+        )
+        generation_config, model_kwargs = self._prepare_generation_config(
+            generation_config, use_model_defaults, **kwargs
+        )
+        generation_mode = generation_config.get_generation_mode(assistant_model)
+
+        self._validate_model_kwargs(model_kwargs.copy())
+        self._validate_generation_mode(generation_mode, generation_config, generation_mode_kwargs)
+
+        # 2. Set generation parameters if not already defined
+        if synced_gpus is None:
+            synced_gpus = (is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)) and dist.get_world_size() > 1
+
+        logits_processor = logits_processor if logits_processor is not None else LogitsProcessorList()
+        stopping_criteria = stopping_criteria if stopping_criteria is not None else StoppingCriteriaList()
+
+        # 3. Define model inputs
+        kwargs_has_attention_mask = model_kwargs.get("attention_mask", None) is not None
+        inputs_tensor, model_input_name, model_kwargs = self._prepare_model_inputs(
+            inputs, generation_config.bos_token_id, model_kwargs
+        )
+        batch_size = inputs_tensor.shape[0]
+
+        device = inputs_tensor.device
+        self._prepare_special_tokens(generation_config, kwargs_has_attention_mask, device=device)
+
+        # 4. Define other model kwargs
+        if "encoder_outputs" not in model_kwargs:
+            # if model is encoder decoder encoder_outputs are created and added to `model_kwargs`
+            model_kwargs = self._prepare_encoder_decoder_kwargs_for_generation(
+                inputs_tensor, model_kwargs, model_input_name, generation_config
+            )
+
+        # 5. Prepare `input_ids` which will be used for auto-regressive generation
+        input_ids, model_kwargs = self._prepare_decoder_input_ids_for_generation(
+            batch_size=batch_size,
+            model_input_name=model_input_name,
+            model_kwargs=model_kwargs,
+            decoder_start_token_id=generation_config._decoder_start_token_tensor,
+            device=inputs_tensor.device,
+        )
+
+        if generation_config.token_healing:
+            input_ids = self.heal_tokens(input_ids, generation_mode_kwargs.get("tokenizer"))
+
+        if streamer is not None:
+            streamer.put(input_ids.cpu())
+
+        # 6. Prepare `max_length` depending on other stopping criteria.
+        # NOTE: incorrect `input_ids.shape[1]` previously
+        input_ids_length = input_ids.shape[-1]
+        has_default_max_length = kwargs.get("max_length") is None and generation_config.max_length is not None
+        has_default_min_length = kwargs.get("min_length") is None and generation_config.min_length is not None
+        generation_config = self._prepare_generated_length(
+            generation_config=generation_config,
+            has_default_max_length=has_default_max_length,
+            has_default_min_length=has_default_min_length,
+            model_input_name=model_input_name,
+            inputs_tensor=inputs_tensor,
+            input_ids_length=input_ids_length,
+        )
+
+        # If the model supports `logits_to_keep` in forward(), set it to 1 to avoid computing the whole
+        # logit matrix. This can save a lot of memory during the first forward pass. Note that assisted decoding
+        # dynamically overrides this value as it can need more than the last token logits
+        if self._supports_logits_to_keep() and "logits_to_keep" not in model_kwargs:
+            model_kwargs["logits_to_keep"] = 1
+
+        self._validate_generated_length(generation_config, input_ids_length, has_default_max_length)
+
+        # 7. Prepare the cache.
+        # - `model_kwargs` may be updated in place with a cache as defined by the parameters in `generation_config`.
+        # - different models have a different cache name expected by the model (default = "past_key_values")
+        # - `max_length`, prepared above, is used to determine the maximum cache length
+        max_cache_length = generation_config.max_length - 1
+        if (
+            inputs_tensor.shape[1] != input_ids_length
+            and model_input_name == "inputs_embeds"
+            and not self.config.is_encoder_decoder
+        ):
+            max_cache_length += inputs_tensor.shape[1]
+        self._prepare_cache_for_generation(
+            generation_config, model_kwargs, generation_mode, batch_size, max_cache_length
+        )
+
+        # 8. prepare logits processors and stopping criteria
+        prepared_logits_processor = self._get_logits_processor(
+            generation_config=generation_config,
+            input_ids_seq_length=input_ids_length,
+            encoder_input_ids=inputs_tensor,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            logits_processor=logits_processor,
+            device=inputs_tensor.device,
+            model_kwargs=model_kwargs,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+        )
+        prepared_stopping_criteria = self._get_stopping_criteria(
+            generation_config=generation_config,
+            stopping_criteria=stopping_criteria,
+            tokenizer=generation_mode_kwargs.get("tokenizer"),
+        )
+
+        # Set model_kwargs `use_cache` so we can use it later in forward runs
+        model_kwargs["use_cache"] = generation_config.use_cache
+        # ******************* taken from main generate function up to calling the different methods *******************
+
+        # Prepare inner 2D logic in generation loop
+        input_ids = input_ids.reshape(-1, input_ids.shape[-1])
+
+        # 10. go into different generation modes
+        if generation_mode in (GenerationMode.SAMPLE, GenerationMode.GREEDY_SEARCH):
+            # 11. expand input_ids with `num_return_sequences` additional sequences per batch
+            if generation_config.num_return_sequences > 1:
+                raise ValueError("`num_return_sequences>1` is incompatible with Dia.")
+
+            # 12. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)
+            return self._sample(
+                input_ids,
+                logits_processor=prepared_logits_processor,
+                stopping_criteria=prepared_stopping_criteria,
+                generation_config=generation_config,
+                **generation_mode_kwargs,
+                **model_kwargs,
+            )
+        else:
+            raise ValueError(
+                "Got incompatible mode for generation, should be one of greedy or sampling. "
+                "Ensure that beam search is de-activated by setting `num_beams=1`."
+            )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        inputs: Optional[torch.Tensor] = None,
+        generation_config: Optional[GenerationConfig] = None,
+        logits_processor: Optional[LogitsProcessorList] = None,
+        stopping_criteria: Optional[StoppingCriteriaList] = None,
+        prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], list[int]]] = None,
+        synced_gpus: Optional[bool] = None,
+        assistant_model: Optional["PreTrainedModel"] = None,
+        streamer: Optional["BaseStreamer"] = None,
+        negative_prompt_ids: Optional[torch.Tensor] = None,
+        negative_prompt_attention_mask: Optional[torch.Tensor] = None,
+        use_model_defaults: Optional[bool] = None,
+        custom_generate: Optional[str] = None,
+        **kwargs,
+    ) -> Union[GenerateOutput, torch.LongTensor]:
+        # We expect the initial input ids to be the complete mask (delayed input)
+        delay_mask = kwargs.get("decoder_input_ids")
+        if delay_mask is not None:
+            delay_mask = delay_mask.clone()
+
+        output = self._main_generate_loop(
+            inputs=inputs,
+            generation_config=generation_config,
+            logits_processor=logits_processor,
+            stopping_criteria=stopping_criteria,
+            prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
+            synced_gpus=synced_gpus,
+            assistant_model=assistant_model,
+            streamer=streamer,
+            negative_prompt_ids=negative_prompt_ids,
+            negative_prompt_attention_mask=negative_prompt_attention_mask,
+            use_model_defaults=use_model_defaults,
+            custom_generate=custom_generate,
+            **kwargs,
+        )
+
+        return_dict_in_generate = not isinstance(output, torch.Tensor)
+
+        if return_dict_in_generate:
+            output_sequences = output.sequences
+        else:
+            output_sequences = output
+
+        # Reshape from 2D (bsz * channels, seq_len) to 3D (bsz, seq_len, channels)
+        num_channels = self.config.decoder_config.num_channels
+        bsz = output_sequences.shape[0] // num_channels
+        output_sequences = output_sequences.reshape(bsz, num_channels, -1).transpose(1, 2)
+
+        # Apply delay mask
+        output_sequences = self.apply_delay_mask(output_sequences, self.config.pad_token_id, delay_mask)
+
+        if return_dict_in_generate:
+            output.sequences = output_sequences
+        else:
+            output = output_sequences
+
+        return output
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..cf662b224aabd884521d7e14b8a167886377f4b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modeling_dia.py
@@ -0,0 +1,958 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dia/modular_dia.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dia.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_attention_mask_for_sdpa
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    is_torch_flex_attn_available,
+    is_torchdynamo_compiling,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig
+from .generation_dia import DiaGenerationMixin
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class DiaPreTrainedModel(PreTrainedModel):
+    config: DiaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    main_input_name = "input_ids"
+    _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]
+
+
+class DiaMultiChannelEmbedding(nn.Module):
+    """In order to efficiently compute the audio embedding from the 9 different channels,
+    we vectorize the embedding process by using a single embedding layer and an offset.
+    Example:
+    - num_embeds = 4
+    - vocab_size = 8
+    - num_channels = 3
+    We would have offsets = [0, 8, 16]
+    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
+    then tokens = audio_codes + offsets
+                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
+    This allows us to use a single embedding layer for all channels.
+    """
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__()
+        self.embed = nn.Embedding(config.vocab_size * config.num_channels, config.hidden_size)
+        self.hidden_size = config.hidden_size
+        self.num_channels = config.num_channels
+        offsets = torch.arange(config.num_channels, dtype=torch.long) * config.vocab_size  # (C,)
+        self.register_buffer("offsets", offsets, persistent=False)
+
+    def forward(self, audio_codes: torch.Tensor) -> torch.Tensor:
+        tokens = (audio_codes + self.offsets.to(audio_codes.device)).squeeze(1)
+        embeds = self.embed(tokens).view(tokens.shape[0], audio_codes.shape[1], -1, self.hidden_size)
+        return embeds.sum(dim=2)
+
+
+class DiaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.config = config
+        self.gate_up_proj = nn.Linear(config.hidden_size, 2 * config.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(config.intermediate_size, config.hidden_size, bias=False)
+        self.activation_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
+        up_states = self.gate_up_proj(hidden_states)
+
+        gate, up_states = up_states.chunk(2, dim=-1)
+        up_states = up_states * self.activation_fn(gate)
+
+        return self.down_proj(up_states)
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class DiaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DiaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class DiaRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: DiaConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class DiaSelfAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.num_heads = self.config.num_attention_heads
+        self.num_key_value_heads = self.config.num_key_value_heads or self.num_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads)
+        self.scaling = 1
+        self.attention_dropout = 0.0
+        self.is_causal = is_causal
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiaCrossAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.cross_hidden_size = config.cross_hidden_size
+        self.num_heads = self.config.cross_num_attention_heads
+        self.num_key_value_heads = self.config.cross_num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.head_dim = config.cross_head_dim
+        self.scaling = 1
+        self.attention_dropout = 0.0
+        self.is_causal = False
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False
+        if past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_values.cross_attention_cache.layers[self.layer_idx].keys
+            value_states = past_key_values.cross_attention_cache.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+            value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all states to the cache
+                key_states, value_states = past_key_values.cross_attention_cache.update(
+                    key_states,
+                    value_states,
+                    self.layer_idx,
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                past_key_values.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape((*input_shape, -1)).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiaEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DiaEncoderConfig, layer_idx: int):
+        super().__init__()
+        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=False)
+        self.post_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        residual = hidden_states
+        normed_states = self.pre_sa_norm(hidden_states)
+        self_attn_output, self_attn_weights = self.self_attention(
+            normed_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+        hidden_states = residual + self_attn_output
+
+        residual = hidden_states
+        normed_states = self.post_sa_norm(hidden_states)
+        mlp_out = self.mlp(normed_states)
+        hidden_states = residual + mlp_out
+
+        return hidden_states, self_attn_weights
+
+
+class DiaEncoder(DiaPreTrainedModel):
+    def __init__(self, config: DiaEncoderConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[BaseModelOutput, tuple]:
+        hidden_states = self.embedding(input_ids)
+
+        # RoPE
+        # Note: We expect right padding and hence always generate
+        # the position ids on the fly to reduce preparation overhead
+        position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :]
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        attention_mask = self._update_full_mask(
+            attention_mask,
+            hidden_states,
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states += (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+class DiaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True)
+        self.cross_attention = DiaCrossAttention(config, layer_idx)
+        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.pre_ca_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.pre_mlp_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        self_attn_cache = past_key_values
+        if isinstance(self_attn_cache, EncoderDecoderCache):
+            self_attn_cache = self_attn_cache.self_attention_cache
+
+        residual = hidden_states
+        normed_states = self.pre_sa_norm(hidden_states)
+        self_attn_output, self_attn_weights = self.self_attention(
+            normed_states,
+            position_embeddings,
+            attention_mask,
+            # Needs to be an arg in order to function properly
+            # on inplace operations to be carried (e.g. compile)
+            self_attn_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + self_attn_output
+
+        residual = hidden_states
+        normed_states = self.pre_ca_norm(hidden_states)
+        cross_states, cross_attn_weights = self.cross_attention(
+            normed_states,
+            encoder_hidden_states,
+            attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            **kwargs,
+        )
+        hidden_states = residual + cross_states
+
+        residual = hidden_states
+        normed_states = self.pre_mlp_norm(hidden_states)
+        mlp_out = self.mlp(normed_states)
+        hidden_states = residual + mlp_out
+
+        return hidden_states, self_attn_weights, cross_attn_weights
+
+
+class DiaDecoder(DiaPreTrainedModel):
+    """Transformer Decoder Stack using DenseGeneral."""
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__(config)
+        self.num_channels = config.num_channels
+        self.vocab_size = config.vocab_size
+        self.embeddings = DiaMultiChannelEmbedding(config)
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+        self.layers = nn.ModuleList(
+            [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
+            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+
+        batch_size, seq_length = input_ids.size()[:-1]
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=input_ids.device
+            )
+        if position_ids is None:
+            position_ids = cache_position[None, :]
+
+        # RoPE
+        hidden_states = self.embeddings(input_ids)
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        if attention_mask is None and not is_torchdynamo_compiling():
+            # required mask seq length can be calculated via length of past cache
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device)
+
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=hidden_states,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            hidden_states.shape[:2],
+            hidden_states,
+        )
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = layer(
+                hidden_states,
+                position_embeddings,
+                attention_mask,
+                encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns = all_self_attns + (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask
+    def _update_cross_attn_mask(
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(encoder_attention_mask, torch.Tensor):
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Dia model outputting raw hidden-states without any specific head on top.
+    """
+)
+class DiaModel(DiaPreTrainedModel):
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.encoder = DiaEncoder(config.encoder_config)
+        self.decoder = DiaDecoder(config.decoder_config)
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
+            tened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        """
+
+        if input_ids is None and encoder_outputs is None:
+            raise ValueError(
+                "You should either provide text ids or the cached text encodings. Neither has been found."
+            )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if self.is_gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                **kwargs,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput
+        elif not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # On default we initialize the decoder with bos tokens if nothing has been provided
+        bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels)
+        if decoder_input_ids is None:
+            decoder_input_ids = torch.full(
+                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
+            )
+        # Ensure 3D
+        if decoder_input_ids.ndim == 2:
+            decoder_input_ids = decoder_input_ids.reshape(bsz, channels, seq_len).transpose(1, 2)
+
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            position_ids=decoder_position_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs[0],
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
+    """
+)
+class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin):
+    base_model_prefix = "model"
+
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = DiaModel(config)
+
+        self.num_channels = config.decoder_config.num_channels
+        self.vocab_size = config.decoder_config.vocab_size
+        self.logits_dense = nn.Linear(
+            config.decoder_config.hidden_size, (self.num_channels * self.vocab_size), bias=False
+        )
+        self.loss_type = "ForMaskedLM"
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
+            tened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in
+            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
+            are ignored (masked).
+        """
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_position_ids=decoder_position_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        last_hidden_state = outputs[0]
+        batch_size = last_hidden_state.shape[0]
+        # 3D <-> 2D makes it necessary to prioritize channel dim
+        audio_logits = (
+            self.logits_dense(last_hidden_state)
+            .view((batch_size, -1, self.num_channels, self.vocab_size))
+            .transpose(1, 2)
+            .contiguous()
+            .view(batch_size * self.num_channels, -1, self.vocab_size)
+        )
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=audio_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modular_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modular_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..f99d32a01d9cffab79bd72852d776447e084681b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/modular_dia.py
@@ -0,0 +1,773 @@
+# coding=utf-8
+# Copyright 2025 The Nari Labs and HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Dia model."""
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import DynamicCache, EncoderDecoderCache
+from ...masking_utils import create_causal_mask
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+)
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, can_return_tuple, is_torch_flex_attn_available, is_torchdynamo_compiling, logging
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaRMSNorm,
+    LlamaRotaryEmbedding,
+    eager_attention_forward,
+)
+from ..phi3.modeling_phi3 import Phi3MLP
+from .configuration_dia import DiaConfig, DiaDecoderConfig, DiaEncoderConfig
+from .generation_dia import DiaGenerationMixin
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+@auto_docstring
+class DiaPreTrainedModel(PreTrainedModel):
+    config: DiaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = True
+    main_input_name = "input_ids"
+    _no_split_modules = ["DiaEncoderLayer", "DiaDecoderLayer"]
+
+
+class DiaMultiChannelEmbedding(nn.Module):
+    """In order to efficiently compute the audio embedding from the 9 different channels,
+    we vectorize the embedding process by using a single embedding layer and an offset.
+    Example:
+    - num_embeds = 4
+    - vocab_size = 8
+    - num_channels = 3
+    We would have offsets = [0, 8, 16]
+    If audio_codes = [0, 1, 2, 3], [1, 3, 4, 7], [5, 6, 7, 8],
+    then tokens = audio_codes + offsets
+                = [0, 1, 2, 3, 9, 11, 12, 15, 21, 22, 23, 24]
+    This allows us to use a single embedding layer for all channels.
+    """
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__()
+        self.embed = nn.Embedding(config.vocab_size * config.num_channels, config.hidden_size)
+        self.hidden_size = config.hidden_size
+        self.num_channels = config.num_channels
+        offsets = torch.arange(config.num_channels, dtype=torch.long) * config.vocab_size  # (C,)
+        self.register_buffer("offsets", offsets, persistent=False)
+
+    def forward(self, audio_codes: torch.Tensor) -> torch.Tensor:
+        tokens = (audio_codes + self.offsets.to(audio_codes.device)).squeeze(1)
+        embeds = self.embed(tokens).view(tokens.shape[0], audio_codes.shape[1], -1, self.hidden_size)
+        return embeds.sum(dim=2)
+
+
+class DiaMLP(Phi3MLP):
+    pass
+
+
+class DiaRMSNorm(LlamaRMSNorm):
+    pass
+
+
+class DiaRotaryEmbedding(LlamaRotaryEmbedding):
+    pass
+
+
+class DiaSelfAttention(LlamaAttention):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Union[DiaEncoderConfig, DiaDecoderConfig], layer_idx: int, is_causal: bool = False):
+        nn.Module.__init__(self)
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.num_heads = self.config.num_attention_heads
+        self.num_key_value_heads = self.config.num_key_value_heads or self.num_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // self.num_heads)
+        self.scaling = 1
+        self.attention_dropout = 0.0
+        self.is_causal = is_causal
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+
+class DiaCrossAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.cross_hidden_size = config.cross_hidden_size
+        self.num_heads = self.config.cross_num_attention_heads
+        self.num_key_value_heads = self.config.cross_num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        self.head_dim = config.cross_head_dim
+        self.scaling = 1
+        self.attention_dropout = 0.0
+        self.is_causal = False
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(self.cross_hidden_size, self.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=False)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        cross_attention_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+        cross_shape = (*cross_attention_states.shape[:-1], -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        is_updated = past_key_values.is_updated.get(self.layer_idx) if past_key_values is not None else False
+        if past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = past_key_values.cross_attention_cache.layers[self.layer_idx].keys
+            value_states = past_key_values.cross_attention_cache.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+            value_states = self.v_proj(cross_attention_states).view(cross_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all states to the cache
+                key_states, value_states = past_key_values.cross_attention_cache.update(
+                    key_states,
+                    value_states,
+                    self.layer_idx,
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                past_key_values.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape((*input_shape, -1)).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiaEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DiaEncoderConfig, layer_idx: int):
+        super().__init__()
+        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=False)
+        self.post_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        residual = hidden_states
+        normed_states = self.pre_sa_norm(hidden_states)
+        self_attn_output, self_attn_weights = self.self_attention(
+            normed_states,
+            position_embeddings=position_embeddings,
+            attention_mask=attention_mask,
+            **kwargs,
+        )
+        hidden_states = residual + self_attn_output
+
+        residual = hidden_states
+        normed_states = self.post_sa_norm(hidden_states)
+        mlp_out = self.mlp(normed_states)
+        hidden_states = residual + mlp_out
+
+        return hidden_states, self_attn_weights
+
+
+class DiaEncoder(DiaPreTrainedModel):
+    def __init__(self, config: DiaEncoderConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embedding = nn.Embedding(config.vocab_size, config.hidden_size)
+        self.layers = nn.ModuleList(
+            [DiaEncoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[BaseModelOutput, tuple]:
+        hidden_states = self.embedding(input_ids)
+
+        # RoPE
+        # Note: We expect right padding and hence always generate
+        # the position ids on the fly to reduce preparation overhead
+        position_ids = torch.arange(input_ids.shape[-1], device=input_ids.device)[None, :]
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        attention_mask = self._update_full_mask(
+            attention_mask,
+            hidden_states,
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+
+            layer_outputs = encoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                attention_mask=attention_mask,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states += (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+
+class DiaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DiaDecoderConfig, layer_idx: int):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attention = DiaSelfAttention(config, layer_idx, is_causal=True)
+        self.cross_attention = DiaCrossAttention(config, layer_idx)
+        self.pre_sa_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.pre_ca_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.pre_mlp_norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+        self.mlp = DiaMLP(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[torch.Tensor]]:
+        self_attn_cache = past_key_values
+        if isinstance(self_attn_cache, EncoderDecoderCache):
+            self_attn_cache = self_attn_cache.self_attention_cache
+
+        residual = hidden_states
+        normed_states = self.pre_sa_norm(hidden_states)
+        self_attn_output, self_attn_weights = self.self_attention(
+            normed_states,
+            position_embeddings,
+            attention_mask,
+            # Needs to be an arg in order to function properly
+            # on inplace operations to be carried (e.g. compile)
+            self_attn_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+        hidden_states = residual + self_attn_output
+
+        residual = hidden_states
+        normed_states = self.pre_ca_norm(hidden_states)
+        cross_states, cross_attn_weights = self.cross_attention(
+            normed_states,
+            encoder_hidden_states,
+            attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            **kwargs,
+        )
+        hidden_states = residual + cross_states
+
+        residual = hidden_states
+        normed_states = self.pre_mlp_norm(hidden_states)
+        mlp_out = self.mlp(normed_states)
+        hidden_states = residual + mlp_out
+
+        return hidden_states, self_attn_weights, cross_attn_weights
+
+
+class DiaDecoder(DiaPreTrainedModel):
+    """Transformer Decoder Stack using DenseGeneral."""
+
+    def __init__(self, config: DiaDecoderConfig):
+        super().__init__(config)
+        self.num_channels = config.num_channels
+        self.vocab_size = config.vocab_size
+        self.embeddings = DiaMultiChannelEmbedding(config)
+        self.rotary_embeddings = DiaRotaryEmbedding(config)
+        self.layers = nn.ModuleList(
+            [DiaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiaRMSNorm(config.hidden_size, eps=config.norm_eps)
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: torch.Tensor,
+        position_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[BaseModelOutputWithPastAndCrossAttentions, tuple]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length, num_codebooks)`):
+            The original `decoder_input_ids` in 3D shape to facilitate more efficient computations.
+
+            [What are input IDs?](../glossary#input-ids)
+        """
+
+        batch_size, seq_length = input_ids.size()[:-1]
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=input_ids.device
+            )
+        if position_ids is None:
+            position_ids = cache_position[None, :]
+
+        # RoPE
+        hidden_states = self.embeddings(input_ids)
+        position_embeddings = self.rotary_embeddings(hidden_states, position_ids)
+
+        if attention_mask is None and not is_torchdynamo_compiling():
+            # required mask seq length can be calculated via length of past cache
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=input_ids.device)
+
+        attention_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=hidden_states,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            hidden_states.shape[:2],
+            hidden_states,
+        )
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = layer(
+                hidden_states,
+                position_embeddings,
+                attention_mask,
+                encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                **kwargs,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns = all_self_attns + (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        hidden_states = self.norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask
+    def _update_cross_attn_mask(
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(encoder_attention_mask, torch.Tensor):
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare Dia model outputting raw hidden-states without any specific head on top.
+    """
+)
+class DiaModel(DiaPreTrainedModel):
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.encoder = DiaEncoder(config.encoder_config)
+        self.decoder = DiaDecoder(config.decoder_config)
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
+            tened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        """
+
+        if input_ids is None and encoder_outputs is None:
+            raise ValueError(
+                "You should either provide text ids or the cached text encodings. Neither has been found."
+            )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        if self.is_gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                **kwargs,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput
+        elif not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # On default we initialize the decoder with bos tokens if nothing has been provided
+        bsz, seq_len, channels = (encoder_outputs[0].shape[0], -1, self.config.decoder_config.num_channels)
+        if decoder_input_ids is None:
+            decoder_input_ids = torch.full(
+                size=(bsz, 1, channels), fill_value=self.config.bos_token_id, device=self.device
+            )
+        # Ensure 3D
+        if decoder_input_ids.ndim == 2:
+            decoder_input_ids = decoder_input_ids.reshape(bsz, channels, seq_len).transpose(1, 2)
+
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            position_ids=decoder_position_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs[0],
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Dia model consisting of a (byte) text encoder and audio decoder with a prediction head on top.
+    """
+)
+class DiaForConditionalGeneration(DiaPreTrainedModel, DiaGenerationMixin):
+    base_model_prefix = "model"
+
+    def __init__(self, config: DiaConfig):
+        super().__init__(config)
+        self.config = config
+        self.model = DiaModel(config)
+
+        self.num_channels = config.decoder_config.num_channels
+        self.vocab_size = config.decoder_config.vocab_size
+        self.logits_dense = nn.Linear(
+            config.decoder_config.hidden_size, (self.num_channels * self.vocab_size), bias=False
+        )
+        self.loss_type = "ForMaskedLM"
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    @auto_docstring
+    @can_return_tuple
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_position_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[Union[BaseModelOutput, tuple]] = None,
+        past_key_values: Optional[EncoderDecoderCache] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> Union[tuple, Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size * num_codebooks, target_sequence_length)
+        or (batch_size, target_sequence_length, num_codebooks)`, *optional*):
+            1. (batch_size * num_codebooks, target_sequence_length): corresponds to the general use case where
+            the audio input codebooks are flattened into the batch dimension. This also aligns with the flat-
+            tened audio logits which are used to calculate the loss.
+
+            2. (batch_size, sequence_length, num_codebooks): corresponds to the internally used shape of
+            Dia to calculate embeddings and subsequent steps more efficiently.
+
+            If no `decoder_input_ids` are provided, it will create a tensor of `bos_token_id` with shape
+            `(batch_size, 1, num_codebooks)`. Indices can be obtained using the [`DiaProcessor`]. See
+            [`DiaProcessor.__call__`] for more details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_position_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`):
+            Indices of positions of each input sequence tokens in the position embeddings.
+            Used to calculate the position embeddings up to `config.decoder_config.max_position_embeddings`.
+
+            [What are position IDs?](../glossary#position-ids)
+        labels (`torch.LongTensor` of shape `(batch_size * num_codebooks,)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in
+            `[0, ..., config.decoder_config.vocab_size - 1]` or -100. Tokens with indices set to `-100`
+            are ignored (masked).
+        """
+
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_position_ids=decoder_position_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        last_hidden_state = outputs[0]
+        batch_size = last_hidden_state.shape[0]
+        # 3D <-> 2D makes it necessary to prioritize channel dim
+        audio_logits = (
+            self.logits_dense(last_hidden_state)
+            .view((batch_size, -1, self.num_channels, self.vocab_size))
+            .transpose(1, 2)
+            .contiguous()
+            .view(batch_size * self.num_channels, -1, self.vocab_size)
+        )
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=audio_logits, labels=labels, vocab_size=self.vocab_size, **kwargs)
+
+        return Seq2SeqLMOutput(
+            loss=loss,
+            logits=audio_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+__all__ = ["DiaModel", "DiaPreTrainedModel", "DiaForConditionalGeneration"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..402f5152a64bda378ccdf5edd512c86fe643145c
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/processing_dia.py
@@ -0,0 +1,474 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor class for Dia"""
+
+import math
+from pathlib import Path
+from typing import Optional, Union
+
+from ...audio_utils import AudioInput, make_list_of_audio
+from ...feature_extraction_utils import BatchFeature
+from ...processing_utils import AudioKwargs, ProcessingKwargs, ProcessorMixin, Unpack
+from ...utils import is_soundfile_available, is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+if is_soundfile_available():
+    import soundfile as sf
+
+
+class DiaAudioKwargs(AudioKwargs, total=False):
+    bos_token_id: int
+    eos_token_id: int
+    pad_token_id: int
+    delay_pattern: list[int]
+    generation: bool
+
+
+class DiaProcessorKwargs(ProcessingKwargs, total=False):
+    audio_kwargs: DiaAudioKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": True,
+            "padding_side": "right",
+            "add_special_tokens": False,
+        },
+        "audio_kwargs": {
+            "eos_token_id": 1024,
+            "pad_token_id": 1025,
+            "bos_token_id": 1026,
+            "delay_pattern": [0, 8, 9, 10, 11, 12, 13, 14, 15],
+            "generation": True,
+            "sampling_rate": 44100,
+        },
+        "common_kwargs": {"return_tensors": "pt"},
+    }
+
+
+class DiaProcessor(ProcessorMixin):
+    r"""
+    Constructs a Dia processor which wraps a [`DiaFeatureExtractor`], [`DiaTokenizer`], and a [`DacModel`] into
+    a single processor. It inherits, the audio feature extraction, tokenizer, and audio encode/decode functio-
+    nalities. See [`~DiaProcessor.__call__`], [`~DiaProcessor.encode`], and [`~DiaProcessor.decode`] for more
+    information.
+
+    Args:
+        feature_extractor (`DiaFeatureExtractor`):
+            An instance of [`DiaFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer (`DiaTokenizer`):
+            An instance of [`DiaTokenizer`]. The tokenizer is a required input.
+        audio_tokenizer (`DacModel`):
+            An instance of [`DacModel`] used to encode/decode audio into/from codebooks. It is is a required input.
+    """
+
+    feature_extractor_class = "DiaFeatureExtractor"
+    tokenizer_class = "DiaTokenizer"
+    audio_tokenizer_class = "DacModel"
+
+    def __init__(self, feature_extractor, tokenizer, audio_tokenizer):
+        super().__init__(feature_extractor, tokenizer, audio_tokenizer=audio_tokenizer)
+
+    def __call__(
+        self,
+        text: Union[str, list[str]],
+        audio: Optional[AudioInput] = None,
+        output_labels: Optional[bool] = False,
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ):
+        """
+        Main method to prepare text(s) and audio to be fed as input to the model. The `audio` argument is
+        forwarded to the DiaFeatureExtractor's [`~DiaFeatureExtractor.__call__`] and subsequently to the
+        DacModel's [`~DacModel.encode`]. The `text` argument to [`~DiaTokenizer.__call__`]. Please refer
+        to the docstring of the above methods for more information.
+        """
+        if not is_torch_available():
+            raise ValueError(
+                "The `DiaProcessor` relies on the `audio_tokenizer` which requires `torch` but we couldn't "
+                "find it in your environment. You can install torch via `pip install torch`."
+            )
+
+        if text is None:
+            raise ValueError("You need to specify the `text` input to process.")
+
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+
+        text_kwargs = output_kwargs["text_kwargs"]
+        audio_kwargs = output_kwargs["audio_kwargs"]
+        common_kwargs = output_kwargs["common_kwargs"]
+
+        return_tensors = common_kwargs.pop("return_tensors", None)
+        if return_tensors != "pt":
+            raise ValueError(f"{self.__class__.__name__} only supports `return_tensors='pt'`.")
+
+        data = {}
+
+        # Text
+        if isinstance(text, str):
+            text = [text]
+        elif not (isinstance(text, (list, tuple)) and all(isinstance(t, str) for t in text)):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        encodings = self.tokenizer(text, **text_kwargs)
+        data.update(encodings)
+
+        # Audio
+        delay_pattern = audio_kwargs.pop("delay_pattern", None)
+        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
+        audio_eos_token_id = audio_kwargs.pop("eos_token_id", None)
+        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
+        generation = audio_kwargs.pop("generation", True)
+        if (
+            audio_bos_token_id is None
+            or audio_eos_token_id is None
+            or audio_pad_token_id is None
+            or delay_pattern is None
+        ):
+            raise ValueError(
+                "To enable processing for Dia, we need the `bos_token_id`, `eos_token_id`, "
+                "`pad_token_id`, and `delay_pattern`. You may have accidentally overwritten one of those."
+            )
+
+        if generation and output_labels:
+            raise ValueError(
+                f"Labels with `generation` is incompatible, got generation={generation}, output_labels={output_labels}."
+            )
+
+        batch_size = data["input_ids"].shape[0]
+        num_channels = len(delay_pattern)
+        max_delay = max(delay_pattern)
+
+        # Voice cloning generation / general training
+        if audio is not None:
+            audio = make_list_of_audio(audio)
+            input_audios = self.feature_extractor(audio, **audio_kwargs)
+
+            compression_rate = math.prod(self.audio_tokenizer.config.downsampling_ratios)
+            max_encoded_sequence_len = input_audios["padding_mask"][0].shape[-1] // compression_rate
+
+            decoder_input_ids = []
+            decoder_attention_mask = []
+            # TODO: dac with batching is currently broken, but non-batch is working
+            # refer to https://gist.github.com/vasqu/643a45b680cf39fd7467271ee2eb6f80 for a validation script
+            for padding_mask, audio in zip(input_audios["padding_mask"], input_audios["input_values"]):
+                # get current length with hop length in mind (as if it were sampled as a single audio)
+                base_pad_len = self.feature_extractor.hop_length
+                current_audio_len = math.ceil(padding_mask.sum(dim=-1) / base_pad_len) * base_pad_len
+
+                encoded_sequence_len = current_audio_len // compression_rate
+                padding_len = max_encoded_sequence_len - encoded_sequence_len
+
+                # compute non-padded forward pass; one extra bos (and eos if training) is added
+                with torch.no_grad():
+                    audio = audio[None, ..., :current_audio_len].to(self.audio_tokenizer.device)
+                    input_ids = self.audio_tokenizer.encode(audio).audio_codes.transpose(1, 2)
+
+                if not generation:
+                    input_ids = torch.nn.functional.pad(
+                        input_ids, pad=(0, 0, 0, 1, 0, 0), mode="constant", value=audio_eos_token_id
+                    )
+
+                # apply padding
+                # +1 for the bos within the real sequence
+                input_ids = torch.nn.functional.pad(
+                    input_ids, pad=(0, 0, padding_len + 1, 0, 0, 0), mode="constant", value=audio_bos_token_id
+                )
+                num_valid_inputs = encoded_sequence_len + 1 + max_delay  # sequence + bos + delay
+                num_valid_inputs += 0 if generation else 1  # eos if training
+                attention_mask = torch.tensor([0] * padding_len + [1] * num_valid_inputs, dtype=torch.long)[None, :]
+
+                decoder_input_ids.append(input_ids)
+                decoder_attention_mask.append(attention_mask)
+
+            decoder_input_ids = torch.cat(decoder_input_ids, dim=0)
+            decoder_attention_mask = torch.cat(decoder_attention_mask, dim=0)
+        # TTS generation
+        elif generation:
+            # all bos to start with TTS
+            decoder_input_ids = torch.full((batch_size, 1, num_channels), audio_bos_token_id, dtype=torch.long)
+
+            # we preemptively add the delay
+            decoder_attention_mask = torch.ones(size=(batch_size, 1 + max_delay), dtype=torch.long)
+        else:
+            raise ValueError("If you try to train, you should provide audio data as well.")
+
+        if batch_size != decoder_input_ids.shape[0]:
+            raise ValueError(
+                f"Need the same amount of samples for both text and audio, but got text samples={batch_size} and "
+                f"audio samples = {decoder_input_ids.shape[0]} instead."
+            )
+
+        # prepare shift indices per delay
+        max_seq_len = decoder_attention_mask.shape[-1]
+        max_audio_len = max_seq_len - max_delay
+        precomputed_idx = self.build_indices(
+            bsz=batch_size,
+            seq_len=max_seq_len,
+            num_channels=num_channels,
+            delay_pattern=delay_pattern,
+            revert=False,
+        )
+
+        # create delay pattern input
+        # the pad token will be used for masking which input is valid for prediction during generation
+        prefill = torch.full(
+            (batch_size, max_seq_len, num_channels),
+            fill_value=audio_pad_token_id,
+            dtype=torch.int,
+        )
+        prefill[:, :max_audio_len] = decoder_input_ids
+
+        delayed_decoder_input_ids = self.apply_audio_delay(
+            audio=prefill,
+            pad_token_id=audio_pad_token_id,
+            bos_token_id=audio_bos_token_id,
+            precomputed_idx=precomputed_idx,
+        )
+
+        data.update({"decoder_input_ids": delayed_decoder_input_ids, "decoder_attention_mask": decoder_attention_mask})
+
+        if output_labels:
+            # Base idea is to shift on the sequence dim
+            labels = data["decoder_input_ids"].clone()[:, 1:]
+            labels[labels == audio_pad_token_id] = -100
+            labels[labels == audio_bos_token_id] = -100
+
+            data["labels"] = labels.transpose(1, 2).reshape(batch_size * num_channels, -1).contiguous().long()
+            data["decoder_input_ids"] = data["decoder_input_ids"][:, :-1]
+            data["decoder_attention_mask"] = data["decoder_attention_mask"][:, :-1]
+
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def batch_decode(
+        self,
+        decoder_input_ids: "torch.Tensor",
+        audio_prompt_len: Optional[int] = None,
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ) -> list["torch.Tensor"]:
+        """
+        Decodes a batch of audio codebook sequences into their respective audio waveforms via the
+        `audio_tokenizer`. See [`~DacModel.decode`] for more information.
+
+        Args:
+            decoder_input_ids (`torch.Tensor`): The complete output sequence of the decoder.
+            audio_prompt_len (`int`): The audio prefix length (e.g. when using voice cloning).
+        """
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+        audio_kwargs = output_kwargs["audio_kwargs"]
+
+        delay_pattern = audio_kwargs.pop("delay_pattern", None)
+        audio_bos_token_id = audio_kwargs.pop("bos_token_id", None)
+        audio_pad_token_id = audio_kwargs.pop("pad_token_id", None)
+        if audio_bos_token_id is None or audio_pad_token_id is None or delay_pattern is None:
+            raise ValueError(
+                "To enable decoding for Dia, we need the `bos_token_id`, `pad_token_id`, "
+                "and `delay_pattern`. You may have accidentally overwritten one of those."
+            )
+
+        # either decode the whole audio sequence or only the generated parts
+        if audio_prompt_len is not None:
+            audio_prompt_len = torch.tensor(audio_prompt_len, device=decoder_input_ids.device, dtype=torch.long)
+            start_of_generation_idx = audio_prompt_len[None].expand(decoder_input_ids.shape[0])
+        else:
+            start_of_generation_idx = (decoder_input_ids[:, :, 0] == audio_bos_token_id).sum(dim=-1)
+        # -1 for the eos token
+        end_of_generation_idx = (
+            decoder_input_ids.shape[1] - (decoder_input_ids[:, :, 0] == audio_pad_token_id).sum(dim=-1) - 1
+        )
+
+        # revert delay
+        bsz, seq_len, num_channels = decoder_input_ids.shape
+        precomputed_idx = self.build_indices(
+            bsz=bsz,
+            seq_len=seq_len,
+            num_channels=num_channels,
+            delay_pattern=delay_pattern,
+            revert=True,
+        )
+
+        output_sequences = self.apply_audio_delay(
+            audio=decoder_input_ids,
+            # We do not care about these values as we cut them out
+            # with `start_of_generation_idx` and `end_of_generation_idx`
+            pad_token_id=-1,
+            bos_token_id=-1,
+            precomputed_idx=precomputed_idx,
+        ).transpose(1, 2)
+
+        # retrieve the correct sequences each
+        audios = []
+        # TODO: see above, dac doesn't work in batches yet
+        with torch.no_grad():
+            for i in range(start_of_generation_idx.shape[0]):
+                output_i = output_sequences[i, :, start_of_generation_idx[i] : end_of_generation_idx[i]][None, ...]
+                output_i = output_i.to(self.audio_tokenizer.device)
+                audio_i = self.audio_tokenizer.decode(audio_codes=output_i).audio_values.cpu().squeeze()
+                audios.append(audio_i)
+
+        return audios
+
+    def decode(
+        self,
+        decoder_input_ids: "torch.Tensor",
+        audio_prompt_len: Optional[int] = None,
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ) -> "torch.Tensor":
+        """
+        Decodes a single sequence of audio codebooks into the respective audio waveform via the
+        `audio_tokenizer`. See [`~DacModel.decode`] and [`~DiaProcessor.batch_decode`] for more information.
+        """
+        if decoder_input_ids.shape[0] != 1:
+            raise ValueError(
+                f"Expecting a single output to be decoded but received {decoder_input_ids.shape[0]} samples instead."
+            )
+
+        return self.batch_decode(decoder_input_ids, audio_prompt_len, **kwargs)[0]
+
+    def get_audio_prompt_len(
+        self,
+        decoder_attention_mask: "torch.Tensor",
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ) -> int:
+        """Utility function to get the audio prompt length."""
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+        audio_kwargs = output_kwargs["audio_kwargs"]
+
+        delay_pattern = audio_kwargs.pop("delay_pattern", None)
+        if delay_pattern is None:
+            raise ValueError(
+                "To enable the utility of retrieving the prompt length for Dia, we need the "
+                "`delay_pattern`. You may have accidentally overwritten this."
+            )
+        return decoder_attention_mask.shape[1] - max(delay_pattern)
+
+    # Copied from transformers.models.csm.processing_csm.CsmProcessor.save_audio with Csm->Dia
+    def save_audio(
+        self,
+        audio: AudioInput,
+        saving_path: Union[str, Path, list[Union[str, Path]]],
+        **kwargs: Unpack[DiaProcessorKwargs],
+    ):
+        # TODO: @eustlb, this should be in AudioProcessor
+        if not is_soundfile_available():
+            raise ImportError("Please install `soundfile` to save audio files.")
+
+        # ensure correct audio input
+        audio = make_list_of_audio(audio)
+
+        # ensure correct saving path
+        if isinstance(saving_path, (str, Path)):
+            saving_path = [saving_path]
+        elif not (isinstance(saving_path, (list, tuple)) and all(isinstance(p, (str, Path)) for p in saving_path)):
+            raise ValueError("Invalid input path. Please provide a string, or a list of strings")
+
+        if len(audio) != len(saving_path):
+            raise ValueError("The number of audio and saving paths must be the same")
+
+        output_kwargs = self._merge_kwargs(
+            DiaProcessorKwargs,
+            **kwargs,
+        )
+        audio_kwargs = output_kwargs["audio_kwargs"]
+        sampling_rate = audio_kwargs["sampling_rate"]
+
+        for audio_value, p in zip(audio, saving_path):
+            if isinstance(audio_value, torch.Tensor):
+                audio_value = audio_value.cpu().float().numpy()
+            sf.write(p, audio_value, sampling_rate)
+
+    @staticmethod
+    def build_indices(
+        bsz: int,
+        seq_len: int,
+        num_channels: int,
+        delay_pattern: list[int],
+        revert: bool = False,
+    ) -> tuple["torch.Tensor", "torch.Tensor"]:
+        """
+        Precompute (sequence_idx, all_idx) so that out[seq, channel] = in[seq - delay[channel], channel]
+        or in[seq, channel] = out[seq + delay[channel], channel] if `revert`.
+        Negative sequence_idx => BOS; sequence_idx >= seq_len => PAD.
+        """
+        delay_array = torch.tensor(delay_pattern, dtype=torch.int32)
+
+        # (0..seq_len-1)
+        sequence_idx = torch.arange(seq_len, dtype=torch.int32)[None, :].expand(bsz, seq_len)[..., None]
+        # + or - delay depending if we delay or revert the delay
+        if not revert:
+            sequence_idx = sequence_idx - delay_array[None, None, :]
+        else:
+            sequence_idx = sequence_idx + delay_array[None, None, :]
+        # if delay goes over the range we clamp back to valid values
+        valid_sequence_idx = torch.clamp(sequence_idx, 0, seq_len - 1)
+
+        batch_idx = torch.arange(bsz, dtype=torch.int32)[:, None, None].expand(bsz, seq_len, num_channels)
+        channel_idx = torch.arange(num_channels, dtype=torch.int32)[None, None, :].expand(bsz, seq_len, num_channels)
+
+        all_idx = torch.stack(
+            [batch_idx.reshape(-1), valid_sequence_idx.reshape(-1), channel_idx.reshape(-1)],
+            dim=1,
+        ).long()
+
+        return sequence_idx, all_idx
+
+    @staticmethod
+    def apply_audio_delay(
+        audio: "torch.Tensor",
+        pad_token_id: int,
+        bos_token_id: int,
+        precomputed_idx: tuple["torch.Tensor", "torch.Tensor"],
+    ) -> "torch.Tensor":
+        """
+        Applies or reverts the delay pattern to batched audio tokens using precomputed indices,
+        inserting BOS where sequence_idx < 0 and PAD where sequence_idx >= seq_len.
+
+        Args:
+            audio: audio tokens of shape [bsz, seq_len, num_channels]
+            pad_token_id: the PAD token
+            bos_token_id: the BOS token
+            precomputed_idx: from `build_indices`
+
+        Returns:
+            final_audio: delayed or reverted audio tokens of shape [bsz, seq_len, num_channels]
+        """
+        # Move everything to the same device
+        device = audio.device
+        sequence_idx, all_idx = precomputed_idx
+        sequence_idx = sequence_idx.to(device)
+        all_idx = all_idx.to(device)
+
+        # Gather per precomputed indices
+        batch_idx, valid_sequence_idx, channel_idx = torch.unbind(all_idx, dim=-1)
+        gathered_audio = audio[batch_idx, valid_sequence_idx, channel_idx].view(audio.size())
+
+        # Mask according to negative sequence_idx => BOS; sequence_idx >= seq_len => PAD
+        mask_bos = sequence_idx < 0
+        mask_pad = sequence_idx >= audio.shape[1]
+        final_audio = torch.where(mask_bos, bos_token_id, torch.where(mask_pad, pad_token_id, gathered_audio))
+
+        return final_audio
+
+
+__all__ = ["DiaProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e205906ea709ee2c20f25b0bf6f4fa66ab1f4a4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dia/tokenization_dia.py
@@ -0,0 +1,118 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for Dia."""
+
+from typing import Optional
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiaTokenizer(PreTrainedTokenizer):
+    """
+    Construct a Dia tokenizer. Dia simply uses raw bytes utf-8 encoding except for special tokens `[S1]` and `[S2]`.
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        unk_token (`str`, *optional*, defaults to `"<pad>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        max_length (`int`, *optional*, defaults to 1024):
+            The maximum length of the sequences when encoding. Sequences longer than this will be truncated.
+        offset (`int`, *optional*, defaults to 0):
+            The offset of the tokenizer.
+    """
+
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        pad_token: Optional[str] = "<pad>",
+        unk_token: Optional[str] = "<pad>",
+        max_length: Optional[int] = 1024,
+        offset: int = 0,
+        **kwargs,
+    ):
+        # We have no eos/bos tokens but allow padding -- no l/r strip as we treat them as tokens as well
+        pad_token = AddedToken(pad_token) if isinstance(pad_token, str) else pad_token
+        unk_token = AddedToken(unk_token) if isinstance(unk_token, str) else unk_token
+
+        self._utf_vocab_size = 2**8  # utf is 8 bits
+        self._added_tokens_decoder = {0: pad_token, 1: AddedToken("[S1]"), 2: AddedToken("[S2]")}
+        self.offset = offset
+        super().__init__(
+            unk_token=unk_token,
+            pad_token=pad_token,
+            max_length=max_length,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return self._utf_vocab_size
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size + self.offset)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        tokens = [chr(i) for i in text.encode("utf-8")]
+        return tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+
+        if len(token) != 1:
+            token_id = None
+        else:
+            token_id = ord(token) + self.offset
+
+        return token_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        token = chr(index - self.offset)
+        return token
+
+    def convert_tokens_to_string(self, tokens: list[str]) -> str:
+        """Converts a sequence of tokens (string) in a single string."""
+        bstring = b""
+        for token in tokens:
+            if token in self.added_tokens_decoder:
+                added_token_obj = self.added_tokens_decoder[token]
+                tok_string = str(added_token_obj).encode("utf-8")
+            elif token in self.added_tokens_encoder:
+                tok_string = token.encode("utf-8")
+            else:
+                tok_string = token.encode("utf-8")  # Assume general string token
+            bstring += tok_string
+        string = bstring.decode("utf-8", errors="ignore")
+        return string
+
+    # No vocab file
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        return ()
+
+
+__all__ = ["DiaTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..c162fce0a48bd164bd0e0a615b942ee4805a12aa
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_diffllama import *
+    from .modeling_diffllama import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/configuration_diffllama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/configuration_diffllama.py
new file mode 100644
index 0000000000000000000000000000000000000000..210607271927ab2f3a7aa1ec1e874fb296c32a73
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/configuration_diffllama.py
@@ -0,0 +1,199 @@
+# coding=utf-8
+# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on Llama implementations in this library and Microsoft's
+# Differential Transformer implementations.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""DiffLlama model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+
+
+class DiffLlamaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DiffLlamaModel`]. It is used to instantiate an DiffLlama
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the defaults
+    will yield a similar configuration to that of the [kajuma/DiffLlama-0.3B-handcut](https://huggingface.co/kajuma/DiffLlama-0.3B-handcut).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the DiffLlama model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`DiffLlamaModel`]
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 8192):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 16):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'diffllama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'diffllama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'diffllama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'diffllama3'. Scaling factor applied to high frequency components of the RoPE
+        attention_bias (`bool`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        lambda_std_dev (`float`, *optional*, defaults to 0.1):
+            The standard deviation for initialization of parameter lambda in attention layer.
+        head_dim (`int`, *optional*):
+            The attention head dimension. If None, it will default to hidden_size // num_heads
+
+    ```python
+    >>> from transformers import DiffLlamaModel, DiffLlamaConfig
+
+    >>> # Initializing a DiffLlama diffllama-7b style configuration
+    >>> configuration = DiffLlamaConfig()
+
+    >>> # Initializing a model from the diffllama-7b style configuration
+    >>> model = DiffLlamaModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "diffllama"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        hidden_size=2048,
+        intermediate_size=8192,
+        num_hidden_layers=16,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=None,
+        bos_token_id=1,
+        eos_token_id=2,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        lambda_std_dev=0.1,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+        self.lambda_std_dev = lambda_std_dev
+        self.head_dim = head_dim if head_dim is not None else self.hidden_size // self.num_attention_heads
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, copy it it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["DiffLlamaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modeling_diffllama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modeling_diffllama.py
new file mode 100644
index 0000000000000000000000000000000000000000..094cc375057f71eb51644bf2b49c524613ed22e1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modeling_diffllama.py
@@ -0,0 +1,767 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/diffllama/modular_diffllama.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_diffllama.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on Llama implementations in this library and Microsoft's
+# Differential Transformer implementations.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, StaticCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
+from ...modeling_layers import (
+    GenericForQuestionAnswering,
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_diffllama import DiffLlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class DiffLlamaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def lambda_init_fn(layer_idx):
+    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)
+
+
+class DiffLlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        # under this are not used
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+        self.lambda_init = lambda_init_fn(layer_idx)
+        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, target_len, _ = hidden_states.size()
+        q_len = target_len
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
+        value_states = value_states.repeat(1, 2, 1, 1)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiffLlamaFlashAttention2(DiffLlamaAttention):
+    """
+    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, None]:
+        if isinstance(past_key_values, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (DiffLlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
+        value_states1 = value_states1.repeat(1, 1, 2, 1)
+        value_states2 = value_states2.repeat(1, 1, 2, 1)
+
+        attn_output1 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states1,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output2 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states2,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)
+
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
+
+
+class DiffLlamaSdpaAttention(DiffLlamaAttention):
+    """
+    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from DiffLlamaAttention.forward
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
+        value_states = value_states.repeat(1, 2, 1, 1)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = causal_mask is None and q_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class DiffLlamaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        DiffLlamaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+DIFFLLAMA_ATTENTION_CLASSES = {
+    "eager": DiffLlamaAttention,
+    "flash_attention_2": DiffLlamaFlashAttention2,
+    "sdpa": DiffLlamaSdpaAttention,
+}
+
+
+class DiffLlamaDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+
+        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+
+        self.mlp = DiffLlamaMLP(config)
+        self.input_layernorm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class DiffLlamaPreTrainedModel(PreTrainedModel):
+    config: DiffLlamaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["DiffLlamaDecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = False
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = False
+    _can_record_outputs = {
+        "hidden_states": DiffLlamaDecoderLayer,
+        "attentions": DiffLlamaAttention,
+    }
+
+    def _init_weights(self, module):
+        super()._init_weights(module)
+        if isinstance(module, DiffLlamaAttention):
+            module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_q2.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_k2.data.normal_(0, self.config.lambda_std_dev)
+
+
+class DiffLlamaRotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: DiffLlamaConfig, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class DiffLlamaModel(DiffLlamaPreTrainedModel):
+    def __init__(self, config: DiffLlamaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [DiffLlamaDecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = DiffLlamaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = DiffLlamaRotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class DiffLlamaForCausalLM(DiffLlamaPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = DiffLlamaModel(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, DiffLlamaForCausalLM
+
+        >>> model = DiffLlamaForCausalLM.from_pretrained("google/diffllama-7b")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/diffllama-7b")
+
+        >>> prompt = "What is your favorite condiment?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "What is your favorite condiment?"
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class DiffLlamaForSequenceClassification(GenericForSequenceClassification, DiffLlamaPreTrainedModel):
+    pass
+
+
+class DiffLlamaForQuestionAnswering(GenericForQuestionAnswering, DiffLlamaPreTrainedModel):
+    base_model_prefix = "transformer"  # For BC, where `transformer` was used instead of `model`
+
+
+class DiffLlamaForTokenClassification(GenericForTokenClassification, DiffLlamaPreTrainedModel):
+    pass
+
+
+__all__ = [
+    "DiffLlamaPreTrainedModel",
+    "DiffLlamaModel",
+    "DiffLlamaForCausalLM",
+    "DiffLlamaForSequenceClassification",
+    "DiffLlamaForQuestionAnswering",
+    "DiffLlamaForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modular_diffllama.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modular_diffllama.py
new file mode 100644
index 0000000000000000000000000000000000000000..253b99edff0d7a557da404fa680ce8403e22ccf1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/diffllama/modular_diffllama.py
@@ -0,0 +1,447 @@
+# coding=utf-8
+# Copyright 2024 weak-kajuma and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on Llama implementations in this library and Microsoft's
+# Differential Transformer implementations.
+
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import math
+from typing import Optional
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache, StaticCache
+from ...modeling_flash_attention_utils import _flash_attention_forward, flash_attn_supports_top_left_mask
+from ...modeling_utils import PreTrainedModel
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+from ..gemma.modeling_gemma import GemmaForCausalLM
+from ..llama.modeling_llama import (
+    LlamaDecoderLayer,
+    LlamaForQuestionAnswering,
+    LlamaForSequenceClassification,
+    LlamaForTokenClassification,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    apply_rotary_pos_emb,
+    repeat_kv,
+)
+from ..mistral.modeling_mistral import MistralMLP
+from .configuration_diffllama import DiffLlamaConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "kajuma/DiffLlama-0.3B-handcut"
+_CONFIG_FOR_DOC = "DiffLlamaConfig"
+
+
+class DiffLlamaMLP(MistralMLP):
+    pass
+
+
+def lambda_init_fn(layer_idx):
+    return 0.8 - 0.6 * math.exp(-0.3 * layer_idx)
+
+
+class DiffLlamaAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: DiffLlamaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.attention_dropout = config.attention_dropout
+        self.hidden_size = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = getattr(config, "head_dim", self.hidden_size // self.num_heads)
+        self.num_key_value_heads = config.num_key_value_heads
+        self.num_key_value_groups = self.num_heads // self.num_key_value_heads
+        # under this are not used
+        self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = config.rope_theta
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(self.hidden_size, self.num_heads * self.head_dim, bias=config.attention_bias)
+        self.k_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.v_proj = nn.Linear(self.hidden_size, self.num_key_value_heads * self.head_dim, bias=config.attention_bias)
+        self.o_proj = nn.Linear(self.num_heads * self.head_dim, self.hidden_size, bias=config.attention_bias)
+
+        self.lambda_init = lambda_init_fn(layer_idx)
+        self.lambda_q1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k1 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_q2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.lambda_k2 = nn.Parameter(torch.normal(0, config.lambda_std_dev, size=(self.head_dim,)))
+        self.groupnorm = nn.RMSNorm(2 * self.head_dim, eps=config.rms_norm_eps, elementwise_affine=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, target_len, _ = hidden_states.size()
+        q_len = target_len
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
+        value_states = value_states.repeat(1, 2, 1, 1)
+
+        attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim)
+
+        if attention_mask is not None:  # no matter the length, we just slice it
+            causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+            attn_weights = attn_weights + causal_mask
+
+        # upcast attention to fp32
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.attention_dropout, training=self.training)
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = torch.matmul(attn_weights, value_states)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.reshape(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class DiffLlamaFlashAttention2(DiffLlamaAttention):
+    """
+    DiffLlama flash attention module. This module inherits from `DiffLlamaAttention` as the weights of the module stays
+    untouched. The only required change would be on the forward pass where it needs to correctly call the public API of
+    flash attention and deal with padding tokens in case the input contains any of them.
+    """
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+
+        # TODO: Should be removed once Flash Attention for RoCm is bumped to 2.1.
+        # flash_attn<2.1 generates top-left aligned causal mask, while what is needed here is bottom-right alignment, that was made default for flash_attn>=2.1. This attribute is used to handle this difference. Reference: https://github.com/Dao-AILab/flash-attention/releases/tag/v2.1.0.
+        # Beware that with flash_attn<2.1, using q_seqlen != k_seqlen (except for the case q_seqlen == 1) produces a wrong mask (top-left).
+        self._flash_attn_uses_top_left_mask = flash_attn_supports_top_left_mask()
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.Tensor, None]:
+        if isinstance(past_key_values, StaticCache):
+            raise ValueError(
+                "`static` cache implementation is not compatible with `attn_implementation==flash_attention_2` "
+                "make sure to use `sdpa` in the mean time, and open an issue at https://github.com/huggingface/transformers"
+            )
+
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        # Flash attention requires the input to have the shape
+        # batch_size x seq_length x head_dim x hidden_dim
+        # therefore we just need to keep the original shape
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        if position_embeddings is None:
+            logger.warning_once(
+                "The attention layers in this model are transitioning from computing the RoPE embeddings internally "
+                "through `position_ids` (2D tensor with the indexes of the tokens), to using externally computed "
+                "`position_embeddings` (Tuple of tensors, containing cos and sin). In v4.46 `position_ids` will be "
+                "removed and `position_embeddings` will be mandatory."
+            )
+            cos, sin = self.rotary_emb(value_states, position_ids)
+        else:
+            cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        # TODO: These transpose are quite inefficient but Flash Attention requires the layout [batch_size, sequence_length, num_heads, head_dim]. We would need to refactor the KV cache
+        # to be able to avoid many of these transpose/reshape/view.
+        query_states = query_states.transpose(1, 2)
+        key_states = key_states.transpose(1, 2)
+        value_states = value_states.transpose(1, 2)
+
+        dropout_rate = self.attention_dropout if self.training else 0.0
+
+        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
+        # therefore the input hidden states gets silently casted in float32. Hence, we need
+        # cast them back in the correct dtype just to be sure everything works as expected.
+        # This might slowdown training & inference so it is recommended to not cast the LayerNorms
+        # in fp32. (DiffLlamaRMSNorm handles it correctly)
+
+        input_dtype = query_states.dtype
+        device_type = query_states.device.type if query_states.device.type != "mps" else "cpu"
+        if input_dtype == torch.float32:
+            if torch.is_autocast_enabled():
+                target_dtype = (
+                    torch.get_autocast_dtype(device_type)
+                    if hasattr(torch, "get_autocast_dtype")
+                    else torch.get_autocast_gpu_dtype()
+                )
+            # Handle the case where the model is quantized
+            elif hasattr(self.config, "_pre_quantization_dtype"):
+                target_dtype = self.config._pre_quantization_dtype
+            else:
+                target_dtype = self.q_proj.weight.dtype
+
+            logger.warning_once(
+                f"The input hidden states seems to be silently casted in float32, this might be related to"
+                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
+                f" {target_dtype}."
+            )
+
+            query_states = query_states.to(target_dtype)
+            key_states = key_states.to(target_dtype)
+            value_states = value_states.to(target_dtype)
+
+        value_states1, value_states2 = torch.chunk(value_states, 2, dim=2)
+        value_states1 = value_states1.repeat(1, 1, 2, 1)
+        value_states2 = value_states2.repeat(1, 1, 2, 1)
+
+        attn_output1 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states1,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output2 = _flash_attention_forward(
+            query_states,
+            key_states,
+            value_states2,
+            attention_mask,
+            q_len,
+            position_ids=position_ids,
+            dropout=dropout_rate,
+            sliding_window=getattr(self, "sliding_window", None),
+            use_top_left_mask=self._flash_attn_uses_top_left_mask,
+            is_causal=self.is_causal,
+        )
+
+        attn_output = torch.cat([attn_output1, attn_output2], dim=-1)
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=2)
+
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.reshape(bsz, q_len, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
+
+
+class DiffLlamaSdpaAttention(DiffLlamaAttention):
+    """
+    DiffLlama attention module using torch.nn.functional.scaled_dot_product_attention. This module inherits from
+    `DiffLlamaAttention` as the weights of the module stays untouched. The only changes are on the forward pass to adapt to
+    SDPA API.
+    """
+
+    # Adapted from DiffLlamaAttention.forward
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: bool = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        bsz, q_len, _ = hidden_states.size()
+
+        query_states = self.q_proj(hidden_states)
+        key_states = self.k_proj(hidden_states)
+        value_states = self.v_proj(hidden_states)
+
+        query_states = query_states.view(bsz, q_len, self.num_heads, self.head_dim).transpose(1, 2)
+        key_states = key_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+        value_states = value_states.view(bsz, q_len, self.num_key_value_heads, self.head_dim).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        key_states = repeat_kv(key_states, self.num_key_value_groups)
+        value_states = repeat_kv(value_states, self.num_key_value_groups)
+        value_states = torch.cat(torch.chunk(value_states, 2, dim=1), dim=-1)
+        value_states = value_states.repeat(1, 2, 1, 1)
+
+        causal_mask = attention_mask
+        if attention_mask is not None:
+            causal_mask = causal_mask[:, :, :, : key_states.shape[-2]]
+
+        # SDPA with memory-efficient backend is currently (torch==2.1.2) bugged with non-contiguous inputs with custom attn_mask,
+        # Reference: https://github.com/pytorch/pytorch/issues/112577.
+        if query_states.device.type == "cuda" and causal_mask is not None:
+            query_states = query_states.contiguous()
+            key_states = key_states.contiguous()
+            value_states = value_states.contiguous()
+
+        # We dispatch to SDPA's Flash Attention or Efficient kernels via this `is_causal` if statement instead of an inline conditional assignment
+        # in SDPA to support both torch.compile's dynamic shapes and full graph options. An inline conditional prevents dynamic shapes from compiling.
+        is_causal = causal_mask is None and q_len > 1
+
+        attn_output = torch.nn.functional.scaled_dot_product_attention(
+            query_states,
+            key_states,
+            value_states,
+            attn_mask=causal_mask,
+            dropout_p=self.attention_dropout if self.training else 0.0,
+            is_causal=is_causal,
+        )
+
+        attn_output1, attn_output2 = torch.chunk(attn_output, 2, dim=1)
+
+        lambda_1 = torch.exp(torch.sum(self.lambda_q1 * self.lambda_k1, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_2 = torch.exp(torch.sum(self.lambda_q2 * self.lambda_k2, dim=-1, dtype=torch.float32)).to(
+            query_states.dtype
+        )
+        lambda_full = lambda_1 - lambda_2 + self.lambda_init
+
+        attn_output = attn_output1 - lambda_full * attn_output2
+        attn_output = (1 - self.lambda_init) * self.groupnorm(attn_output)
+        attn_output = attn_output.transpose(1, 2).contiguous()
+        attn_output = attn_output.view(bsz, q_len, -1)
+        attn_output = self.o_proj(attn_output)
+        return attn_output, None
+
+
+DIFFLLAMA_ATTENTION_CLASSES = {
+    "eager": DiffLlamaAttention,
+    "flash_attention_2": DiffLlamaFlashAttention2,
+    "sdpa": DiffLlamaSdpaAttention,
+}
+
+
+class DiffLlamaDecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: DiffLlamaConfig, layer_idx: int):
+        super().__init__(config, layer_idx)
+
+        self.self_attn = DIFFLLAMA_ATTENTION_CLASSES[config._attn_implementation](config=config, layer_idx=layer_idx)
+
+
+class DiffLlamaPreTrainedModel(LlamaPreTrainedModel):
+    _supports_flex_attn = False
+    _supports_attention_backend = False
+
+    def _init_weights(self, module):
+        PreTrainedModel._init_weights(self, module)
+        if isinstance(module, DiffLlamaAttention):
+            module.lambda_q1.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_k1.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_q2.data.normal_(0, self.config.lambda_std_dev)
+            module.lambda_k2.data.normal_(0, self.config.lambda_std_dev)
+
+
+class DiffLlamaModel(LlamaModel):
+    pass
+
+
+class DiffLlamaForCausalLM(GemmaForCausalLM):
+    pass
+
+
+class DiffLlamaForSequenceClassification(LlamaForSequenceClassification):
+    pass
+
+
+class DiffLlamaForQuestionAnswering(LlamaForQuestionAnswering):
+    pass
+
+
+class DiffLlamaForTokenClassification(LlamaForTokenClassification):
+    pass
+
+
+__all__ = [
+    "DiffLlamaPreTrainedModel",
+    "DiffLlamaModel",
+    "DiffLlamaForCausalLM",
+    "DiffLlamaForSequenceClassification",
+    "DiffLlamaForQuestionAnswering",
+    "DiffLlamaForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b64cdbb3c7eb0467f6112225b8c0d9e1f65f9e99
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_dinat import *
+    from .modeling_dinat import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbe30443f29195d178c9668f72978179c9f285b5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9253c61b9c513defb5fde11579c8216b5d3d39a6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/configuration_dinat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..789f73cec864be698526ecda3173b7d2136ca2fe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/__pycache__/modeling_dinat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/configuration_dinat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/configuration_dinat.py
new file mode 100644
index 0000000000000000000000000000000000000000..f7d7fa509c5a3b2f5efc3b936cf1761b4ab0e107
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/configuration_dinat.py
@@ -0,0 +1,152 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Dilated Neighborhood Attention Transformer model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class DinatConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`DinatModel`]. It is used to instantiate a Dinat
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Dinat
+    [shi-labs/dinat-mini-in1k-224](https://huggingface.co/shi-labs/dinat-mini-in1k-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch. NOTE: Only patch size of 4 is supported at the moment.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embed_dim (`int`, *optional*, defaults to 64):
+            Dimensionality of patch embedding.
+        depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 5]`):
+            Number of layers in each level of the encoder.
+        num_heads (`list[int]`, *optional*, defaults to `[2, 4, 8, 16]`):
+            Number of attention heads in each layer of the Transformer encoder.
+        kernel_size (`int`, *optional*, defaults to 7):
+            Neighborhood Attention kernel size.
+        dilations (`list[list[int]]`, *optional*, defaults to `[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]]`):
+            Dilation value of each NA layer in the Transformer encoder.
+        mlp_ratio (`float`, *optional*, defaults to 3.0):
+            Ratio of MLP hidden dimensionality to embedding dimensionality.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not a learnable bias should be added to the queries, keys and values.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings and encoder.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.1):
+            Stochastic depth rate.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        layer_scale_init_value (`float`, *optional*, defaults to 0.0):
+            The initial value for the layer scale. Disabled if <=0.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+    Example:
+
+    ```python
+    >>> from transformers import DinatConfig, DinatModel
+
+    >>> # Initializing a Dinat shi-labs/dinat-mini-in1k-224 style configuration
+    >>> configuration = DinatConfig()
+
+    >>> # Initializing a model (with random weights) from the shi-labs/dinat-mini-in1k-224 style configuration
+    >>> model = DinatModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dinat"
+
+    attribute_map = {
+        "num_attention_heads": "num_heads",
+        "num_hidden_layers": "num_layers",
+    }
+
+    def __init__(
+        self,
+        patch_size=4,
+        num_channels=3,
+        embed_dim=64,
+        depths=[3, 4, 6, 5],
+        num_heads=[2, 4, 8, 16],
+        kernel_size=7,
+        dilations=[[1, 8, 1], [1, 4, 1, 4], [1, 2, 1, 2, 1, 2], [1, 1, 1, 1, 1]],
+        mlp_ratio=3.0,
+        qkv_bias=True,
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        drop_path_rate=0.1,
+        hidden_act="gelu",
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        layer_scale_init_value=0.0,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.embed_dim = embed_dim
+        self.depths = depths
+        self.num_layers = len(depths)
+        self.num_heads = num_heads
+        self.kernel_size = kernel_size
+        self.dilations = dilations
+        self.mlp_ratio = mlp_ratio
+        self.qkv_bias = qkv_bias
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.drop_path_rate = drop_path_rate
+        self.hidden_act = hidden_act
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        # we set the hidden_size attribute in order to make Dinat work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * 2 ** (len(depths) - 1))
+        self.layer_scale_init_value = layer_scale_init_value
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["DinatConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/modeling_dinat.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/modeling_dinat.py
new file mode 100644
index 0000000000000000000000000000000000000000..4b7ec37b0ea8489e5db87df22c1876fd4548fe86
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinat/modeling_dinat.py
@@ -0,0 +1,855 @@
+# coding=utf-8
+# Copyright 2022 SHI Labs and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Dilated Neighborhood Attention Transformer model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BackboneOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    ModelOutput,
+    OptionalDependencyNotAvailable,
+    auto_docstring,
+    is_natten_available,
+    logging,
+    requires_backends,
+)
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_dinat import DinatConfig
+
+
+if is_natten_available():
+    from natten.functional import natten2dav, natten2dqkrpb
+else:
+
+    def natten2dqkrpb(*args, **kwargs):
+        raise OptionalDependencyNotAvailable()
+
+    def natten2dav(*args, **kwargs):
+        raise OptionalDependencyNotAvailable()
+
+
+logger = logging.get_logger(__name__)
+
+
+# drop_path and DinatDropPath are from the timm library.
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Dinat encoder's outputs, with potential hidden states and attentions.
+    """
+)
+class DinatEncoderOutput(ModelOutput):
+    r"""
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Dinat model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+class DinatModelOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+        Average pooling of the last layer hidden-state.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Dinat outputs for image classification.
+    """
+)
+class DinatImageClassifierOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Classification (or regression if config.num_labels==1) loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, config.num_labels)`):
+        Classification (or regression if config.num_labels==1) scores (before SoftMax).
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, hidden_size, height, width)`.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class DinatEmbeddings(nn.Module):
+    """
+    Construct the patch and position embeddings.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.patch_embeddings = DinatPatchEmbeddings(config)
+
+        self.norm = nn.LayerNorm(config.embed_dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> tuple[torch.Tensor]:
+        embeddings = self.patch_embeddings(pixel_values)
+        embeddings = self.norm(embeddings)
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class DinatPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, height, width, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        patch_size = config.patch_size
+        num_channels, hidden_size = config.num_channels, config.embed_dim
+        self.num_channels = num_channels
+
+        if patch_size == 4:
+            pass
+        else:
+            # TODO: Support arbitrary patch sizes.
+            raise ValueError("Dinat only supports patch size of 4 at the moment.")
+
+        self.projection = nn.Sequential(
+            nn.Conv2d(self.num_channels, hidden_size // 2, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+            nn.Conv2d(hidden_size // 2, hidden_size, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1)),
+        )
+
+    def forward(self, pixel_values: Optional[torch.FloatTensor]) -> torch.Tensor:
+        _, num_channels, height, width = pixel_values.shape
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embeddings = self.projection(pixel_values)
+        embeddings = embeddings.permute(0, 2, 3, 1)
+
+        return embeddings
+
+
+class DinatDownsampler(nn.Module):
+    """
+    Convolutional Downsampling Layer.
+
+    Args:
+        dim (`int`):
+            Number of input channels.
+        norm_layer (`nn.Module`, *optional*, defaults to `nn.LayerNorm`):
+            Normalization layer class.
+    """
+
+    def __init__(self, dim: int, norm_layer: nn.Module = nn.LayerNorm) -> None:
+        super().__init__()
+        self.dim = dim
+        self.reduction = nn.Conv2d(dim, 2 * dim, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
+        self.norm = norm_layer(2 * dim)
+
+    def forward(self, input_feature: torch.Tensor) -> torch.Tensor:
+        input_feature = self.reduction(input_feature.permute(0, 3, 1, 2)).permute(0, 2, 3, 1)
+        input_feature = self.norm(input_feature)
+        return input_feature
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Dinat
+class DinatDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class NeighborhoodAttention(nn.Module):
+    def __init__(self, config, dim, num_heads, kernel_size, dilation):
+        super().__init__()
+        if dim % num_heads != 0:
+            raise ValueError(
+                f"The hidden size ({dim}) is not a multiple of the number of attention heads ({num_heads})"
+            )
+
+        self.num_attention_heads = num_heads
+        self.attention_head_size = int(dim / num_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.kernel_size = kernel_size
+        self.dilation = dilation
+
+        # rpb is learnable relative positional biases; same concept is used Swin.
+        self.rpb = nn.Parameter(torch.zeros(num_heads, (2 * self.kernel_size - 1), (2 * self.kernel_size - 1)))
+
+        self.query = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(self.all_head_size, self.all_head_size, bias=config.qkv_bias)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        key_layer = (
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        # Apply the scale factor before computing attention weights. It's usually more efficient because
+        # attention weights are typically a bigger tensor compared to query.
+        # It gives identical results because scalars are commutable in matrix multiplication.
+        query_layer = query_layer / math.sqrt(self.attention_head_size)
+
+        # Compute NA between "query" and "key" to get the raw attention scores, and add relative positional biases.
+        attention_scores = natten2dqkrpb(query_layer, key_layer, self.rpb, self.kernel_size, self.dilation)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        context_layer = natten2dav(attention_probs, value_layer, self.kernel_size, self.dilation)
+        context_layer = context_layer.permute(0, 2, 3, 1, 4).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+
+        return outputs
+
+
+class NeighborhoodAttentionOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, dim)
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+
+        return hidden_states
+
+
+class NeighborhoodAttentionModule(nn.Module):
+    def __init__(self, config, dim, num_heads, kernel_size, dilation):
+        super().__init__()
+        self.self = NeighborhoodAttention(config, dim, num_heads, kernel_size, dilation)
+        self.output = NeighborhoodAttentionOutput(config, dim)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        self_outputs = self.self(hidden_states, output_attentions)
+        attention_output = self.output(self_outputs[0], hidden_states)
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+
+class DinatIntermediate(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(dim, int(config.mlp_ratio * dim))
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+class DinatOutput(nn.Module):
+    def __init__(self, config, dim):
+        super().__init__()
+        self.dense = nn.Linear(int(config.mlp_ratio * dim), dim)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class DinatLayer(nn.Module):
+    def __init__(self, config, dim, num_heads, dilation, drop_path_rate=0.0):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.kernel_size = config.kernel_size
+        self.dilation = dilation
+        self.window_size = self.kernel_size * self.dilation
+        self.layernorm_before = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.attention = NeighborhoodAttentionModule(
+            config, dim, num_heads, kernel_size=self.kernel_size, dilation=self.dilation
+        )
+        self.drop_path = DinatDropPath(drop_path_rate) if drop_path_rate > 0.0 else nn.Identity()
+        self.layernorm_after = nn.LayerNorm(dim, eps=config.layer_norm_eps)
+        self.intermediate = DinatIntermediate(config, dim)
+        self.output = DinatOutput(config, dim)
+        self.layer_scale_parameters = (
+            nn.Parameter(config.layer_scale_init_value * torch.ones((2, dim)), requires_grad=True)
+            if config.layer_scale_init_value > 0
+            else None
+        )
+
+    def maybe_pad(self, hidden_states, height, width):
+        window_size = self.window_size
+        pad_values = (0, 0, 0, 0, 0, 0)
+        if height < window_size or width < window_size:
+            pad_l = pad_t = 0
+            pad_r = max(0, window_size - width)
+            pad_b = max(0, window_size - height)
+            pad_values = (0, 0, pad_l, pad_r, pad_t, pad_b)
+            hidden_states = nn.functional.pad(hidden_states, pad_values)
+        return hidden_states, pad_values
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size, height, width, channels = hidden_states.size()
+        shortcut = hidden_states
+
+        hidden_states = self.layernorm_before(hidden_states)
+        # pad hidden_states if they are smaller than kernel size x dilation
+        hidden_states, pad_values = self.maybe_pad(hidden_states, height, width)
+
+        _, height_pad, width_pad, _ = hidden_states.shape
+
+        attention_outputs = self.attention(hidden_states, output_attentions=output_attentions)
+
+        attention_output = attention_outputs[0]
+
+        was_padded = pad_values[3] > 0 or pad_values[5] > 0
+        if was_padded:
+            attention_output = attention_output[:, :height, :width, :].contiguous()
+
+        if self.layer_scale_parameters is not None:
+            attention_output = self.layer_scale_parameters[0] * attention_output
+
+        hidden_states = shortcut + self.drop_path(attention_output)
+
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.output(self.intermediate(layer_output))
+
+        if self.layer_scale_parameters is not None:
+            layer_output = self.layer_scale_parameters[1] * layer_output
+
+        layer_output = hidden_states + self.drop_path(layer_output)
+
+        layer_outputs = (layer_output, attention_outputs[1]) if output_attentions else (layer_output,)
+        return layer_outputs
+
+
+class DinatStage(nn.Module):
+    def __init__(self, config, dim, depth, num_heads, dilations, drop_path_rate, downsample):
+        super().__init__()
+        self.config = config
+        self.dim = dim
+        self.layers = nn.ModuleList(
+            [
+                DinatLayer(
+                    config=config,
+                    dim=dim,
+                    num_heads=num_heads,
+                    dilation=dilations[i],
+                    drop_path_rate=drop_path_rate[i],
+                )
+                for i in range(depth)
+            ]
+        )
+
+        # patch merging layer
+        if downsample is not None:
+            self.downsample = downsample(dim=dim, norm_layer=nn.LayerNorm)
+        else:
+            self.downsample = None
+
+        self.pointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        _, height, width, _ = hidden_states.size()
+        for i, layer_module in enumerate(self.layers):
+            layer_outputs = layer_module(hidden_states, output_attentions)
+            hidden_states = layer_outputs[0]
+
+        hidden_states_before_downsampling = hidden_states
+        if self.downsample is not None:
+            hidden_states = self.downsample(hidden_states_before_downsampling)
+
+        stage_outputs = (hidden_states, hidden_states_before_downsampling)
+
+        if output_attentions:
+            stage_outputs += layer_outputs[1:]
+        return stage_outputs
+
+
+class DinatEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.num_levels = len(config.depths)
+        self.config = config
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, sum(config.depths), device="cpu")]
+        self.levels = nn.ModuleList(
+            [
+                DinatStage(
+                    config=config,
+                    dim=int(config.embed_dim * 2**i_layer),
+                    depth=config.depths[i_layer],
+                    num_heads=config.num_heads[i_layer],
+                    dilations=config.dilations[i_layer],
+                    drop_path_rate=dpr[sum(config.depths[:i_layer]) : sum(config.depths[: i_layer + 1])],
+                    downsample=DinatDownsampler if (i_layer < self.num_levels - 1) else None,
+                )
+                for i_layer in range(self.num_levels)
+            ]
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        output_hidden_states_before_downsampling: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple, DinatEncoderOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if output_hidden_states:
+            # rearrange b h w c -> b c h w
+            reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
+            all_hidden_states += (hidden_states,)
+            all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+        for i, layer_module in enumerate(self.levels):
+            layer_outputs = layer_module(hidden_states, output_attentions)
+
+            hidden_states = layer_outputs[0]
+            hidden_states_before_downsampling = layer_outputs[1]
+
+            if output_hidden_states and output_hidden_states_before_downsampling:
+                # rearrange b h w c -> b c h w
+                reshaped_hidden_state = hidden_states_before_downsampling.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states_before_downsampling,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+            elif output_hidden_states and not output_hidden_states_before_downsampling:
+                # rearrange b h w c -> b c h w
+                reshaped_hidden_state = hidden_states.permute(0, 3, 1, 2)
+                all_hidden_states += (hidden_states,)
+                all_reshaped_hidden_states += (reshaped_hidden_state,)
+
+            if output_attentions:
+                all_self_attentions += layer_outputs[2:]
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+
+        return DinatEncoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            reshaped_hidden_states=all_reshaped_hidden_states,
+        )
+
+
+@auto_docstring
+class DinatPreTrainedModel(PreTrainedModel):
+    config: DinatConfig
+    base_model_prefix = "dinat"
+    main_input_name = "pixel_values"
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class DinatModel(DinatPreTrainedModel):
+    def __init__(self, config, add_pooling_layer=True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+
+        requires_backends(self, ["natten"])
+
+        self.config = config
+        self.num_levels = len(config.depths)
+        self.num_features = int(config.embed_dim * 2 ** (self.num_levels - 1))
+
+        self.embeddings = DinatEmbeddings(config)
+        self.encoder = DinatEncoder(config)
+
+        self.layernorm = nn.LayerNorm(self.num_features, eps=config.layer_norm_eps)
+        self.pooler = nn.AdaptiveAvgPool1d(1) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DinatModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = encoder_outputs[0]
+        sequence_output = self.layernorm(sequence_output)
+
+        pooled_output = None
+        if self.pooler is not None:
+            pooled_output = self.pooler(sequence_output.flatten(1, 2).transpose(1, 2))
+            pooled_output = torch.flatten(pooled_output, 1)
+
+        if not return_dict:
+            output = (sequence_output, pooled_output) + encoder_outputs[1:]
+
+            return output
+
+        return DinatModelOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinat Model transformer with an image classification head on top (a linear layer on top of the final hidden state
+    of the [CLS] token) e.g. for ImageNet.
+    """
+)
+class DinatForImageClassification(DinatPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        requires_backends(self, ["natten"])
+
+        self.num_labels = config.num_labels
+        self.dinat = DinatModel(config)
+
+        # Classifier head
+        self.classifier = (
+            nn.Linear(self.dinat.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, DinatImageClassifierOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.dinat(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return DinatImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            reshaped_hidden_states=outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    NAT backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class DinatBackbone(DinatPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        requires_backends(self, ["natten"])
+
+        self.embeddings = DinatEmbeddings(config)
+        self.encoder = DinatEncoder(config)
+        self.num_features = [config.embed_dim] + [int(config.embed_dim * 2**i) for i in range(len(config.depths))]
+
+        # Add layer norms to hidden states of out_features
+        hidden_states_norms = {}
+        for stage, num_channels in zip(self._out_features, self.channels):
+            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
+        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("shi-labs/nat-mini-in1k-224")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "shi-labs/nat-mini-in1k-224", out_features=["stage1", "stage2", "stage3", "stage4"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 512, 7, 7]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        embedding_output = self.embeddings(pixel_values)
+
+        outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            output_hidden_states_before_downsampling=True,
+            return_dict=True,
+        )
+
+        hidden_states = outputs.reshaped_hidden_states
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                batch_size, num_channels, height, width = hidden_state.shape
+                hidden_state = hidden_state.permute(0, 2, 3, 1).contiguous()
+                hidden_state = hidden_state.view(batch_size, height * width, num_channels)
+                hidden_state = self.hidden_states_norms[stage](hidden_state)
+                hidden_state = hidden_state.view(batch_size, height, width, num_channels)
+                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps += (hidden_state,)
+
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs.hidden_states,)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = ["DinatForImageClassification", "DinatModel", "DinatPreTrainedModel", "DinatBackbone"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..48bb6f4ca9223757afd8cf7da0bd2c90bd371784
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..923ce38f539afb2ce60cb26c2fb3e0181bfdf50a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/configuration_dinov2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..975e54dd5792e9d440fb576f2cd3b27cf8255c91
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_dinov2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86285018bba522776f6c43710816293d0b60637e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2/__pycache__/modeling_flax_dinov2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d10027b6a3b6375235a6785df044e8f0ce5fb33
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_dinov2_with_registers import *
+    from .modeling_dinov2_with_registers import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py
new file mode 100644
index 0000000000000000000000000000000000000000..ec4f446fc684f40d634927c1e7a52b64c5732b12
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/configuration_dinov2_with_registers.py
@@ -0,0 +1,159 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dinov2_with_registers.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from ...configuration_utils import PretrainedConfig
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate an
+    Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+    [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+           Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_register_tokens (`int`, *optional*, defaults to 4):
+            Number of register tokens to use.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+
+    Example:
+
+    ```python
+    >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel
+
+    >>> # Initializing a Dinov2WithRegisters base style configuration
+    >>> configuration = Dinov2WithRegistersConfig()
+
+    >>> # Initializing a model (with random weights) from the base style configuration
+    >>> model = Dinov2WithRegistersModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dinov2_with_registers"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        mlp_ratio=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=224,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        layerscale_value=1.0,
+        drop_path_rate=0.0,
+        use_swiglu_ffn=False,
+        num_register_tokens=4,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.use_swiglu_ffn = use_swiglu_ffn
+        self.num_register_tokens = num_register_tokens
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+
+
+__all__ = ["Dinov2WithRegistersConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
new file mode 100644
index 0000000000000000000000000000000000000000..9c6f4c335f58b2ddd37bf4042d5f8e51c474cee9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modeling_dinov2_with_registers.py
@@ -0,0 +1,712 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_dinov2_with_registers.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import collections.abc
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...pytorch_utils import find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import TransformersKwargs, auto_docstring, torch_int
+from ...utils.backbone_utils import BackboneMixin
+from ...utils.generic import can_return_tuple, check_model_inputs
+from .configuration_dinov2_with_registers import Dinov2WithRegistersConfig
+
+
+class Dinov2WithRegistersPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        image_size, patch_size = config.image_size, config.patch_size
+        num_channels, hidden_size = config.num_channels, config.hidden_size
+
+        image_size = image_size if isinstance(image_size, collections.abc.Iterable) else (image_size, image_size)
+        patch_size = patch_size if isinstance(patch_size, collections.abc.Iterable) else (patch_size, patch_size)
+        num_patches = (image_size[1] // patch_size[1]) * (image_size[0] // patch_size[0])
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.num_patches = num_patches
+
+        self.projection = nn.Conv2d(num_channels, hidden_size, kernel_size=patch_size, stride=patch_size)
+
+    def forward(self, pixel_values: torch.Tensor) -> torch.Tensor:
+        num_channels = pixel_values.shape[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+                f" Expected {self.num_channels} but got {num_channels}."
+            )
+        embeddings = self.projection(pixel_values).flatten(2).transpose(1, 2)
+        return embeddings
+
+
+class Dinov2WithRegistersEmbeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, register tokens, position and patch embeddings.
+    """
+
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+        self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
+        self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
+        resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility
+        with the original implementation.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+        - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py
+        """
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # Skip interpolation for matching dimensions (unless tracing)
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        # Handle class token and patch embeddings separately
+        class_pos_embed = self.position_embeddings[:, 0]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+        dim = embeddings.shape[-1]
+
+        # Calculate new dimensions
+        height = height // self.config.patch_size
+        width = width // self.config.patch_size
+
+        # Reshape for interpolation
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        # Store original dtype for restoration after interpolation
+        target_dtype = patch_pos_embed.dtype
+
+        # Interpolate at float32 precision
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.to(dtype=torch.float32),
+            size=(torch_int(height), torch_int(width)),  # Explicit size instead of scale_factor
+            mode="bicubic",
+            align_corners=False,
+            antialias=True,
+        ).to(dtype=target_dtype)
+
+        # Validate output dimensions if not tracing
+        if not torch.jit.is_tracing():
+            if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
+                raise ValueError("Width or height does not match with the interpolated position embeddings")
+
+        # Reshape back to original format
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        # Combine class and patch embeddings
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+        if bool_masked_pos is not None:
+            embeddings = torch.where(
+                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+            )
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        # add register tokens
+        embeddings = torch.cat(
+            (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+        )
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    # Take the dot product between "query" and "key" to get the raw attention scores.
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+
+    # Normalize the attention scores to probabilities.
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+
+    # This is actually dropping out entire tokens to attend to, which might
+    # seem a bit unusual, but is taken from the original Transformer paper.
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    # Mask heads if we want to
+    if attention_mask is not None:
+        attn_weights = attn_weights * attention_mask
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class Dinov2WithRegistersSelfAttention(nn.Module):
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size {config.hidden_size} is not a multiple of the number of attention "
+                f"heads {config.num_attention_heads}."
+            )
+
+        self.config = config
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.dropout_prob = config.attention_probs_dropout_prob
+        self.scaling = self.attention_head_size**-0.5
+        self.is_causal = False
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size, bias=config.qkv_bias)
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        batch_size = hidden_states.shape[0]
+        new_shape = batch_size, -1, self.num_attention_heads, self.attention_head_size
+
+        key_layer = self.key(hidden_states).view(*new_shape).transpose(1, 2)
+        value_layer = self.value(hidden_states).view(*new_shape).transpose(1, 2)
+        query_layer = self.query(hidden_states).view(*new_shape).transpose(1, 2)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        context_layer, attention_probs = attention_interface(
+            self,
+            query_layer,
+            key_layer,
+            value_layer,
+            head_mask,
+            is_causal=self.is_causal,
+            scaling=self.scaling,
+            dropout=0.0 if not self.training else self.dropout_prob,
+        )
+
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.reshape(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+class Dinov2WithRegistersSelfOutput(nn.Module):
+    """
+    The residual connection is defined in Dinov2WithRegistersLayer instead of here (as is the case with other models), due to the
+    layernorm applied before each block.
+    """
+
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class Dinov2WithRegistersAttention(nn.Module):
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        self.attention = Dinov2WithRegistersSelfAttention(config)
+        self.output = Dinov2WithRegistersSelfOutput(config)
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads: set[int]):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.attention.num_attention_heads, self.attention.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.attention.query = prune_linear_layer(self.attention.query, index)
+        self.attention.key = prune_linear_layer(self.attention.key, index)
+        self.attention.value = prune_linear_layer(self.attention.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.attention.num_attention_heads = self.attention.num_attention_heads - len(heads)
+        self.attention.all_head_size = self.attention.attention_head_size * self.attention.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def forward(self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None) -> torch.Tensor:
+        self_attn_output, _ = self.attention(hidden_states, head_mask)
+        output = self.output(self_attn_output, hidden_states)
+        return output
+
+
+class Dinov2WithRegistersLayerScale(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        self.lambda1 = nn.Parameter(config.layerscale_value * torch.ones(config.hidden_size))
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        return hidden_state * self.lambda1
+
+
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+class Dinov2WithRegistersDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class Dinov2WithRegistersMLP(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        in_features = out_features = config.hidden_size
+        hidden_features = int(config.hidden_size * config.mlp_ratio)
+        self.fc1 = nn.Linear(in_features, hidden_features, bias=True)
+        if isinstance(config.hidden_act, str):
+            self.activation = ACT2FN[config.hidden_act]
+        else:
+            self.activation = config.hidden_act
+        self.fc2 = nn.Linear(hidden_features, out_features, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.fc1(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        hidden_state = self.fc2(hidden_state)
+        return hidden_state
+
+
+class Dinov2WithRegistersSwiGLUFFN(nn.Module):
+    def __init__(self, config) -> None:
+        super().__init__()
+        in_features = out_features = config.hidden_size
+        hidden_features = int(config.hidden_size * config.mlp_ratio)
+        hidden_features = (int(hidden_features * 2 / 3) + 7) // 8 * 8
+
+        self.weights_in = nn.Linear(in_features, 2 * hidden_features, bias=True)
+        self.weights_out = nn.Linear(hidden_features, out_features, bias=True)
+
+    def forward(self, hidden_state: torch.Tensor) -> torch.Tensor:
+        hidden_state = self.weights_in(hidden_state)
+        x1, x2 = hidden_state.chunk(2, dim=-1)
+        hidden = nn.functional.silu(x1) * x2
+        return self.weights_out(hidden)
+
+
+class Dinov2WithRegistersLayer(GradientCheckpointingLayer):
+    """This corresponds to the Block class in the original implementation."""
+
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__()
+
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attention = Dinov2WithRegistersAttention(config)
+        self.layer_scale1 = Dinov2WithRegistersLayerScale(config)
+        self.drop_path = (
+            Dinov2WithRegistersDropPath(config.drop_path_rate) if config.drop_path_rate > 0.0 else nn.Identity()
+        )
+
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        if config.use_swiglu_ffn:
+            self.mlp = Dinov2WithRegistersSwiGLUFFN(config)
+        else:
+            self.mlp = Dinov2WithRegistersMLP(config)
+        self.layer_scale2 = Dinov2WithRegistersLayerScale(config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        hidden_states_norm = self.norm1(hidden_states)
+        self_attention_output = self.attention(hidden_states_norm, head_mask)
+        self_attention_output = self.layer_scale1(self_attention_output)
+
+        # first residual connection
+        hidden_states = self.drop_path(self_attention_output) + hidden_states
+
+        # in Dinov2WithRegisters, layernorm is also applied after self-attention
+        layer_output = self.norm2(hidden_states)
+        layer_output = self.mlp(layer_output)
+        layer_output = self.layer_scale2(layer_output)
+
+        # second residual connection
+        layer_output = self.drop_path(layer_output) + hidden_states
+
+        return layer_output
+
+
+class Dinov2WithRegistersEncoder(nn.Module):
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([Dinov2WithRegistersLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.Tensor] = None, output_hidden_states: bool = False
+    ) -> BaseModelOutput:
+        all_hidden_states = [hidden_states] if output_hidden_states else None
+        for i, layer_module in enumerate(self.layer):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            hidden_states = layer_module(hidden_states, layer_head_mask)
+            if all_hidden_states:
+                all_hidden_states.append(hidden_states)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=tuple(all_hidden_states) if all_hidden_states else None,
+        )
+
+
+@auto_docstring
+class Dinov2WithRegistersPreTrainedModel(PreTrainedModel):
+    config: Dinov2WithRegistersConfig
+    base_model_prefix = "dinov2_with_registers"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Dinov2WithRegistersLayer"]
+    _supports_sdpa = True
+    _supports_flash_attn = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "attentions": Dinov2WithRegistersSelfAttention,
+    }
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, Dinov2WithRegistersEmbeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+
+            module.mask_token.data.zero_()
+            module.register_tokens.data.zero_()
+        elif isinstance(module, Dinov2WithRegistersLayerScale):  # noqa: F821
+            module.lambda1.data.fill_(self.config.layerscale_value)
+
+
+@auto_docstring
+class Dinov2WithRegistersModel(Dinov2WithRegistersPreTrainedModel):
+    def __init__(self, config: Dinov2WithRegistersConfig):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = Dinov2WithRegistersEmbeddings(config)
+        self.encoder = Dinov2WithRegistersEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @check_model_inputs(tie_last_hidden_states=False)
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        bool_masked_pos: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BaseModelOutputWithPooling:
+        r"""
+        bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+            Boolean masked positions. Indicates which patches are masked (1) and which aren't (0). Only relevant for
+            pre-training.
+        """
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        embedding_output = self.embeddings(pixel_values, bool_masked_pos=bool_masked_pos)
+
+        encoder_outputs: BaseModelOutput = self.encoder(
+            embedding_output, head_mask=head_mask, output_hidden_states=output_hidden_states
+        )
+        sequence_output = encoder_outputs.last_hidden_state
+        sequence_output = self.layernorm(sequence_output)
+        pooled_output = sequence_output[:, 0, :]
+
+        return BaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinov2WithRegisters Model transformer with an image classification head on top (a linear layer on top of the final hidden state
+    of the [CLS] token) e.g. for ImageNet.
+    """
+)
+class Dinov2WithRegistersForImageClassification(Dinov2WithRegistersPreTrainedModel):
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.dinov2_with_registers = Dinov2WithRegistersModel(config)
+
+        # Classifier head
+        self.classifier = (
+            nn.Linear(config.hidden_size * 2, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.dinov2_with_registers(pixel_values, head_mask=head_mask, **kwargs)
+        sequence_output = outputs.last_hidden_state  # batch_size, sequence_length, hidden_size
+
+        cls_token = sequence_output[:, 0]
+        # cls and register tokens should not be included in patch tokens variable
+        patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :]
+
+        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
+        logits = self.classifier(linear_input)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Dinov2WithRegisters backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class Dinov2WithRegistersBackbone(Dinov2WithRegistersPreTrainedModel, BackboneMixin):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+        self.embeddings = Dinov2WithRegistersEmbeddings(config)
+        self.encoder = Dinov2WithRegistersEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.num_register_tokens = config.num_register_tokens
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 16, 16]
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        embedding_output = self.embeddings(pixel_values)
+        output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)
+        hidden_states = output.hidden_states
+
+        feature_maps = []
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, 1 + self.num_register_tokens :]
+                    # this was actually a bug in the original implementation that we copied here,
+                    # cause normally the order is height, width
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps.append(hidden_state)
+
+        return BackboneOutput(
+            feature_maps=tuple(feature_maps),
+            hidden_states=hidden_states if output_hidden_states else None,
+        )
+
+
+__all__ = [
+    "Dinov2WithRegistersPreTrainedModel",
+    "Dinov2WithRegistersModel",
+    "Dinov2WithRegistersForImageClassification",
+    "Dinov2WithRegistersBackbone",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
new file mode 100644
index 0000000000000000000000000000000000000000..686528002b09c9689d66a057ed55eb1a43b0d256
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dinov2_with_registers/modular_dinov2_with_registers.py
@@ -0,0 +1,435 @@
+# coding=utf-8
+# Copyright 2024 Meta Inc. and the HuggingFace Inc. team. All rights reserved.
+#
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ....transformers.models.dinov2.modeling_dinov2 import (
+    Dinov2Backbone,
+    Dinov2Encoder,
+    Dinov2ForImageClassification,
+    Dinov2Model,
+    Dinov2PatchEmbeddings,
+    Dinov2PreTrainedModel,
+)
+from ...configuration_utils import PretrainedConfig
+from ...modeling_outputs import BackboneOutput, BaseModelOutput, BaseModelOutputWithPooling, ImageClassifierOutput
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging, torch_int
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class Dinov2WithRegistersConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Dinov2WithRegistersModel`]. It is used to instantiate an
+    Dinov2WithRegisters model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the DINOv2 with Registers
+    [facebook/dinov2-with-registers-base](https://huggingface.co/facebook/dinov2-with-registers-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`int`, *optional*, defaults to 4):
+            Ratio of the hidden size of the MLPs relative to the `hidden_size`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 16):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        layerscale_value (`float`, *optional*, defaults to 1.0):
+           Initial value to use for layer scale.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            Stochastic depth rate per sample (when applied in the main path of residual layers).
+        use_swiglu_ffn (`bool`, *optional*, defaults to `False`):
+            Whether to use the SwiGLU feedforward neural network.
+        num_register_tokens (`int`, *optional*, defaults to 4):
+            Number of register tokens to use.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        apply_layernorm (`bool`, *optional*, defaults to `True`):
+            Whether to apply layer normalization to the feature maps in case the model is used as backbone.
+        reshape_hidden_states (`bool`, *optional*, defaults to `True`):
+            Whether to reshape the feature maps to 4D tensors of shape `(batch_size, hidden_size, height, width)` in
+            case the model is used as backbone. If `False`, the feature maps will be 3D tensors of shape `(batch_size,
+            seq_len, hidden_size)`.
+
+    Example:
+
+    ```python
+    >>> from transformers import Dinov2WithRegistersConfig, Dinov2WithRegistersModel
+
+    >>> # Initializing a Dinov2WithRegisters base style configuration
+    >>> configuration = Dinov2WithRegistersConfig()
+
+    >>> # Initializing a model (with random weights) from the base style configuration
+    >>> model = Dinov2WithRegistersModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "dinov2_with_registers"
+
+    def __init__(
+        self,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        mlp_ratio=4,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.0,
+        attention_probs_dropout_prob=0.0,
+        initializer_range=0.02,
+        layer_norm_eps=1e-6,
+        image_size=224,
+        patch_size=16,
+        num_channels=3,
+        qkv_bias=True,
+        layerscale_value=1.0,
+        drop_path_rate=0.0,
+        use_swiglu_ffn=False,
+        num_register_tokens=4,
+        out_features=None,
+        out_indices=None,
+        apply_layernorm=True,
+        reshape_hidden_states=True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.hidden_act = hidden_act
+        self.hidden_dropout_prob = hidden_dropout_prob
+        self.attention_probs_dropout_prob = attention_probs_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.qkv_bias = qkv_bias
+        self.layerscale_value = layerscale_value
+        self.drop_path_rate = drop_path_rate
+        self.use_swiglu_ffn = use_swiglu_ffn
+        self.num_register_tokens = num_register_tokens
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, num_hidden_layers + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+        self.apply_layernorm = apply_layernorm
+        self.reshape_hidden_states = reshape_hidden_states
+
+
+class Dinov2WithRegistersPatchEmbeddings(Dinov2PatchEmbeddings):
+    pass
+
+
+class Dinov2WithRegistersEmbeddings(nn.Module):
+    """
+    Construct the CLS token, mask token, register tokens, position and patch embeddings.
+    """
+
+    def __init__(self, config: Dinov2WithRegistersConfig) -> None:
+        super().__init__()
+
+        self.cls_token = nn.Parameter(torch.randn(1, 1, config.hidden_size))
+        self.mask_token = nn.Parameter(torch.zeros(1, config.hidden_size))
+        self.register_tokens = nn.Parameter(torch.zeros(1, config.num_register_tokens, config.hidden_size))
+        self.patch_embeddings = Dinov2WithRegistersPatchEmbeddings(config)
+        num_patches = self.patch_embeddings.num_patches
+        self.position_embeddings = nn.Parameter(torch.randn(1, num_patches + 1, config.hidden_size))
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        self.patch_size = config.patch_size
+        self.config = config
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher
+        resolution images. This implementation supports torch.jit tracing while maintaining backwards compatibility
+        with the original implementation.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/main/vision_transformer.py
+        - https://github.com/facebookresearch/dinov2/blob/main/dinov2/models/vision_transformer.py
+        """
+        num_patches = embeddings.shape[1] - 1
+        num_positions = self.position_embeddings.shape[1] - 1
+
+        # Skip interpolation for matching dimensions (unless tracing)
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embeddings
+
+        # Handle class token and patch embeddings separately
+        class_pos_embed = self.position_embeddings[:, 0]
+        patch_pos_embed = self.position_embeddings[:, 1:]
+        dim = embeddings.shape[-1]
+
+        # Calculate new dimensions
+        height = height // self.config.patch_size
+        width = width // self.config.patch_size
+
+        # Reshape for interpolation
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        # Store original dtype for restoration after interpolation
+        target_dtype = patch_pos_embed.dtype
+
+        # Interpolate at float32 precision
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed.to(dtype=torch.float32),
+            size=(torch_int(height), torch_int(width)),  # Explicit size instead of scale_factor
+            mode="bicubic",
+            align_corners=False,
+            antialias=True,
+        ).to(dtype=target_dtype)
+
+        # Validate output dimensions if not tracing
+        if not torch.jit.is_tracing():
+            if int(height) != patch_pos_embed.shape[-2] or int(width) != patch_pos_embed.shape[-1]:
+                raise ValueError("Width or height does not match with the interpolated position embeddings")
+
+        # Reshape back to original format
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        # Combine class and patch embeddings
+        return torch.cat((class_pos_embed.unsqueeze(0), patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.Tensor, bool_masked_pos: Optional[torch.Tensor] = None) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        target_dtype = self.patch_embeddings.projection.weight.dtype
+        embeddings = self.patch_embeddings(pixel_values.to(dtype=target_dtype))
+
+        if bool_masked_pos is not None:
+            embeddings = torch.where(
+                bool_masked_pos.unsqueeze(-1), self.mask_token.to(embeddings.dtype).unsqueeze(0), embeddings
+            )
+
+        # add the [CLS] token to the embedded patch tokens
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embeddings = torch.cat((cls_tokens, embeddings), dim=1)
+
+        # add positional encoding to each token
+        embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+
+        # add register tokens
+        embeddings = torch.cat(
+            (embeddings[:, :1], self.register_tokens.expand(embeddings.shape[0], -1, -1), embeddings[:, 1:]), dim=1
+        )
+
+        embeddings = self.dropout(embeddings)
+
+        return embeddings
+
+
+class Dinov2WithRegistersEncoder(Dinov2Encoder):
+    pass
+
+
+class Dinov2WithRegistersPreTrainedModel(Dinov2PreTrainedModel):
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d, nn.LayerNorm]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            # Upcast the input in `fp32` and cast it back to desired `dtype` to avoid
+            # `trunc_normal_cpu` not implemented in `half` issues
+            module.weight.data = nn.init.trunc_normal_(
+                module.weight.data.to(torch.float32), mean=0.0, std=self.config.initializer_range
+            ).to(module.weight.dtype)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, Dinov2WithRegistersEmbeddings):
+            module.position_embeddings.data = nn.init.trunc_normal_(
+                module.position_embeddings.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.position_embeddings.dtype)
+
+            module.cls_token.data = nn.init.trunc_normal_(
+                module.cls_token.data.to(torch.float32),
+                mean=0.0,
+                std=self.config.initializer_range,
+            ).to(module.cls_token.dtype)
+
+            module.mask_token.data.zero_()
+            module.register_tokens.data.zero_()
+        elif isinstance(module, Dinov2WithRegistersLayerScale):  # noqa: F821
+            module.lambda1.data.fill_(self.config.layerscale_value)
+
+
+class Dinov2WithRegistersModel(Dinov2Model):
+    pass
+
+
+class Dinov2WithRegistersForImageClassification(Dinov2ForImageClassification):
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> ImageClassifierOutput:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        outputs: BaseModelOutputWithPooling = self.dinov2_with_registers(pixel_values, head_mask=head_mask, **kwargs)
+        sequence_output = outputs.last_hidden_state  # batch_size, sequence_length, hidden_size
+
+        cls_token = sequence_output[:, 0]
+        # cls and register tokens should not be included in patch tokens variable
+        patch_tokens = sequence_output[:, 1 + self.config.num_register_tokens :]
+
+        linear_input = torch.cat([cls_token, patch_tokens.mean(dim=1)], dim=1)
+        logits = self.classifier(linear_input)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config, **kwargs)
+
+        return ImageClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Dinov2WithRegistersBackbone(Dinov2Backbone):
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.num_register_tokens = config.num_register_tokens
+        self.num_features = [config.hidden_size for _ in range(config.num_hidden_layers + 1)]
+        self.embeddings = Dinov2WithRegistersEmbeddings(config)
+        self.encoder = Dinov2WithRegistersEncoder(config)
+
+        self.layernorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> Dinov2WithRegistersPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        **kwargs,
+    ) -> BackboneOutput:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/dinov2-with-registers-base")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/dinov2-with-registers-base", out_features=["stage2", "stage5", "stage8", "stage11"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 16, 16]
+        ```"""
+        if output_hidden_states is None:
+            output_hidden_states = self.config.output_hidden_states
+
+        embedding_output = self.embeddings(pixel_values)
+        output: BaseModelOutput = self.encoder(embedding_output, output_hidden_states=True)
+        hidden_states = output.hidden_states
+
+        feature_maps = []
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                if self.config.apply_layernorm:
+                    hidden_state = self.layernorm(hidden_state)
+                if self.config.reshape_hidden_states:
+                    hidden_state = hidden_state[:, 1 + self.num_register_tokens :]
+                    # this was actually a bug in the original implementation that we copied here,
+                    # cause normally the order is height, width
+                    batch_size, _, height, width = pixel_values.shape
+                    patch_size = self.config.patch_size
+                    hidden_state = hidden_state.reshape(batch_size, height // patch_size, width // patch_size, -1)
+                    hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps.append(hidden_state)
+
+        return BackboneOutput(
+            feature_maps=tuple(feature_maps),
+            hidden_states=hidden_states if output_hidden_states else None,
+        )
+
+
+__all__ = [
+    "Dinov2WithRegistersConfig",
+    "Dinov2WithRegistersPreTrainedModel",
+    "Dinov2WithRegistersModel",
+    "Dinov2WithRegistersForImageClassification",
+    "Dinov2WithRegistersBackbone",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..599b811d4b320ccb5e37de217903203234cb210e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92d3d510d394c16661ab288a691967d7fc9e97f2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/configuration_dots1.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d20bd4055bdf368cd830a646ff8138553628f8b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modeling_dots1.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f2e4d2b614f86cda26121d71c053efb2b14c5f3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dots1/__pycache__/modular_dots1.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..644582dc354127eae90c97b70441e83e866a9db0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7cff243f188e561abb1590c18b57bd3ca27c3288
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/configuration_dpr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e5eb4f4d24c2a1de027096d615415d2b2179b62
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_dpr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ec9e4c526880f439b96ba51b71e1a6cca4cde2b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/modeling_tf_dpr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c716d6dc82e478bfc4689ff6e08dea518c1677d9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60b79af407650d33542b97524342ee0fba792ff8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/dpr/__pycache__/tokenization_dpr_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..acb7d0e68bcf65f2ff3ffd97412eef64b294bd56
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/configuration_edgetam.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/configuration_edgetam.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0b8970371b7bcce7b3f24c8074032adda7cf985
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/configuration_edgetam.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modeling_edgetam.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modeling_edgetam.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d0a059a23eda779a10986b4835102c6b4ccbcad3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modeling_edgetam.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modular_edgetam.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modular_edgetam.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..054b910ec29bc99d72568c85b18c8b22ef935ad8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam/__pycache__/modular_edgetam.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0041ad10528d3476375e9c063642a8615fcd96cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/configuration_edgetam_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/configuration_edgetam_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..714257f6830885e281e2965f537db9314c482e58
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/configuration_edgetam_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/modular_edgetam_video.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/modular_edgetam_video.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..64e81e52a3a867a3be66bb5a04fa4b4361de10e9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/edgetam_video/__pycache__/modular_edgetam_video.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5ab3632346a8617f48ac7e4da81f906afa8e04b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1484c2d0363731fa03458eb00227ec60d523e3ef
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/configuration_emu3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..724667c4fde01b25406b0a304ec4eee34ac98f74
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/image_processing_emu3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81b29339028dcd953130b34ad2610a2c311bb02a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modeling_emu3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0c95204eb37e7e2aefb3ca524ecef36c59f9ae43
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/modular_emu3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22c400874c8080e2a590bcd49ee5ecb8606c9d58
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/emu3/__pycache__/processing_emu3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b637f9cf4f167ceafc67bcccf46d18817038825
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..651fd220de7a9e6eed86bbde354be56f3113e3f7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/configuration_encodec.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..186107de88454a08029eafe4030f32618dcc645f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/feature_extraction_encodec.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0c1ab9537cbb6ee96f34c85d849310070ac1d0b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/encodec/__pycache__/modeling_encodec.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..a735fcee001a0d94215c7c101ecad523b4fee3fd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/chunk_utils.py
@@ -0,0 +1,398 @@
+# Copyright 2021 AlQuraishi Laboratory
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import logging
+import math
+from collections.abc import Iterable, Sequence
+from functools import partial
+from typing import Any, Callable, Optional, Union
+
+import torch
+
+from .tensor_utils import tensor_tree_map, tree_map
+
+
+def _fetch_dims(tree: Union[dict, list, tuple, torch.Tensor]) -> list[tuple[int, ...]]:
+    shapes = []
+    if isinstance(tree, dict):
+        for v in tree.values():
+            shapes.extend(_fetch_dims(v))
+    elif isinstance(tree, (list, tuple)):
+        for t in tree:
+            shapes.extend(_fetch_dims(t))
+    elif isinstance(tree, torch.Tensor):
+        shapes.append(tree.shape)
+    else:
+        raise TypeError("Not supported")
+
+    return shapes
+
+
+@torch.jit.ignore
+def _flat_idx_to_idx(flat_idx: int, dims: tuple[int, ...]) -> tuple[int, ...]:
+    idx = []
+    for d in reversed(dims):
+        idx.append(flat_idx % d)
+        flat_idx = flat_idx // d
+
+    return tuple(reversed(idx))
+
+
+@torch.jit.ignore
+def _get_minimal_slice_set(
+    start: Sequence[int],
+    end: Sequence[int],
+    dims: Sequence[int],
+    start_edges: Optional[Sequence[bool]] = None,
+    end_edges: Optional[Sequence[bool]] = None,
+) -> list[tuple[slice, ...]]:
+    """
+    Produces an ordered sequence of tensor slices that, when used in sequence on a tensor with shape dims, yields
+    tensors that contain every leaf in the contiguous range [start, end]. Care is taken to yield a short sequence of
+    slices, and perhaps even the shortest possible (I'm pretty sure it's the latter).
+
+    end is INCLUSIVE.
+    """
+
+    # start_edges and end_edges both indicate whether, starting from any given
+    # dimension, the start/end index is at the top/bottom edge of the
+    # corresponding tensor, modeled as a tree
+    def reduce_edge_list(l: list[bool]) -> None:
+        tally = True
+        for i in range(len(l)):
+            reversed_idx = -1 * (i + 1)
+            l[reversed_idx] &= tally
+            tally = l[reversed_idx]
+
+    if start_edges is None:
+        start_edges = [s == 0 for s in start]
+        reduce_edge_list(start_edges)
+    if end_edges is None:
+        end_edges = [e == (d - 1) for e, d in zip(end, dims)]
+        reduce_edge_list(end_edges)
+
+    # Base cases. Either start/end are empty and we're done, or the final,
+    # one-dimensional tensor can be simply sliced
+    if len(start) == 0:
+        return [()]
+    elif len(start) == 1:
+        return [(slice(start[0], end[0] + 1),)]
+
+    slices: list[tuple[slice, ...]] = []
+    path_list: list[slice] = []
+
+    # Dimensions common to start and end can be selected directly
+    for s, e in zip(start, end):
+        if s == e:
+            path_list.append(slice(s, s + 1))
+        else:
+            break
+
+    path: tuple[slice, ...] = tuple(path_list)
+    divergence_idx = len(path)
+
+    # start == end, and we're done
+    if divergence_idx == len(dims):
+        return [path]
+
+    def upper() -> tuple[tuple[slice, ...], ...]:
+        assert start_edges is not None
+        assert end_edges is not None
+
+        sdi = start[divergence_idx]
+        return tuple(
+            path + (slice(sdi, sdi + 1),) + s
+            for s in _get_minimal_slice_set(
+                start[divergence_idx + 1 :],
+                [d - 1 for d in dims[divergence_idx + 1 :]],
+                dims[divergence_idx + 1 :],
+                start_edges=start_edges[divergence_idx + 1 :],
+                end_edges=[True for _ in end_edges[divergence_idx + 1 :]],
+            )
+        )
+
+    def lower() -> tuple[tuple[slice, ...], ...]:
+        assert start_edges is not None
+        assert end_edges is not None
+
+        edi = end[divergence_idx]
+        return tuple(
+            path + (slice(edi, edi + 1),) + s
+            for s in _get_minimal_slice_set(
+                [0 for _ in start[divergence_idx + 1 :]],
+                end[divergence_idx + 1 :],
+                dims[divergence_idx + 1 :],
+                start_edges=[True for _ in start_edges[divergence_idx + 1 :]],
+                end_edges=end_edges[divergence_idx + 1 :],
+            )
+        )
+
+    # If both start and end are at the edges of the subtree rooted at
+    # divergence_idx, we can just select the whole subtree at once
+    if start_edges[divergence_idx] and end_edges[divergence_idx]:
+        slices.append(path + (slice(start[divergence_idx], end[divergence_idx] + 1),))
+    # If just start is at the edge, we can grab almost all of the subtree,
+    # treating only the ragged bottom edge as an edge case
+    elif start_edges[divergence_idx]:
+        slices.append(path + (slice(start[divergence_idx], end[divergence_idx]),))
+        slices.extend(lower())
+    # Analogous to the previous case, but the top is ragged this time
+    elif end_edges[divergence_idx]:
+        slices.extend(upper())
+        slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx] + 1),))
+    # If both sides of the range are ragged, we need to handle both sides
+    # separately. If there's contiguous meat in between them, we can index it
+    # in one big chunk
+    else:
+        slices.extend(upper())
+        middle_ground = end[divergence_idx] - start[divergence_idx]
+        if middle_ground > 1:
+            slices.append(path + (slice(start[divergence_idx] + 1, end[divergence_idx]),))
+        slices.extend(lower())
+
+    return slices
+
+
+@torch.jit.ignore
+def _chunk_slice(t: torch.Tensor, flat_start: int, flat_end: int, no_batch_dims: int) -> torch.Tensor:
+    """
+    Equivalent to
+
+        t.reshape((-1,) + t.shape[no_batch_dims:])[flat_start:flat_end]
+
+    but without the need for the initial reshape call, which can be memory-intensive in certain situations. The only
+    reshape operations in this function are performed on sub-tensors that scale with (flat_end - flat_start), the chunk
+    size.
+    """
+
+    batch_dims = t.shape[:no_batch_dims]
+    start_idx = list(_flat_idx_to_idx(flat_start, batch_dims))
+    # _get_minimal_slice_set is inclusive
+    end_idx = list(_flat_idx_to_idx(flat_end - 1, batch_dims))
+
+    # Get an ordered list of slices to perform
+    slices = _get_minimal_slice_set(
+        start_idx,
+        end_idx,
+        batch_dims,
+    )
+
+    sliced_tensors = [t[s] for s in slices]
+
+    return torch.cat([s.view((-1,) + t.shape[no_batch_dims:]) for s in sliced_tensors])
+
+
+def chunk_layer(
+    layer: Callable,
+    inputs: dict[str, Any],
+    chunk_size: int,
+    no_batch_dims: int,
+    low_mem: bool = False,
+    _out: Any = None,
+    _add_into_out: bool = False,
+) -> Any:
+    """
+    Implements the "chunking" procedure described in section 1.11.8.
+
+    Layer outputs and inputs are assumed to be simple "pytrees," consisting only of (arbitrarily nested) lists, tuples,
+    and dicts with torch.Tensor leaves.
+
+    Args:
+        layer:
+            The layer to be applied chunk-wise
+        inputs:
+            A (non-nested) dictionary of keyworded inputs. All leaves must be tensors and must share the same batch
+            dimensions.
+        chunk_size:
+            The number of sub-batches per chunk. If multiple batch dimensions are specified, a "sub-batch" is defined
+            as a single indexing of all batch dimensions simultaneously (s.t. the number of sub-batches is the product
+            of the batch dimensions).
+        no_batch_dims:
+            How many of the initial dimensions of each input tensor can be considered batch dimensions.
+        low_mem:
+            Avoids flattening potentially large input tensors. Unnecessary in most cases, and is ever so slightly
+            slower than the default setting.
+    Returns:
+        The reassembled output of the layer on the inputs.
+    """
+    if not (len(inputs) > 0):
+        raise ValueError("Must provide at least one input")
+
+    initial_dims = [shape[:no_batch_dims] for shape in _fetch_dims(inputs)]
+    orig_batch_dims = tuple(max(s) for s in zip(*initial_dims))
+
+    def _prep_inputs(t: torch.Tensor) -> torch.Tensor:
+        if not low_mem:
+            if sum(t.shape[:no_batch_dims]) != no_batch_dims:
+                t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
+            t = t.reshape(-1, *t.shape[no_batch_dims:])
+        else:
+            t = t.expand(orig_batch_dims + t.shape[no_batch_dims:])
+        return t
+
+    prepped_inputs: dict[str, Any] = tensor_tree_map(_prep_inputs, inputs)
+    prepped_outputs = None
+    if _out is not None:
+        prepped_outputs = tensor_tree_map(lambda t: t.view([-1] + list(t.shape[no_batch_dims:])), _out)
+
+    flat_batch_dim = 1
+    for d in orig_batch_dims:
+        flat_batch_dim *= d
+
+    no_chunks = flat_batch_dim // chunk_size + (flat_batch_dim % chunk_size != 0)
+
+    def _select_chunk(t: torch.Tensor) -> torch.Tensor:
+        return t[i : i + chunk_size] if t.shape[0] != 1 else t
+
+    i = 0
+    out = prepped_outputs
+    for _ in range(no_chunks):
+        # Chunk the input
+        if not low_mem:
+            select_chunk = _select_chunk
+        else:
+            select_chunk = partial(
+                _chunk_slice,
+                flat_start=i,
+                flat_end=min(flat_batch_dim, i + chunk_size),
+                no_batch_dims=len(orig_batch_dims),
+            )
+
+        chunks: dict[str, Any] = tensor_tree_map(select_chunk, prepped_inputs)
+
+        # Run the layer on the chunk
+        output_chunk = layer(**chunks)
+
+        # Allocate space for the output
+        if out is None:
+            out = tensor_tree_map(lambda t: t.new_zeros((flat_batch_dim,) + t.shape[1:]), output_chunk)
+
+        # Put the chunk in its pre-allocated space
+        if isinstance(output_chunk, dict):
+
+            def assign(d1: dict, d2: dict) -> None:
+                for k, v in d1.items():
+                    if isinstance(v, dict):
+                        assign(v, d2[k])
+                    else:
+                        if _add_into_out:
+                            v[i : i + chunk_size] += d2[k]
+                        else:
+                            v[i : i + chunk_size] = d2[k]
+
+            assign(out, output_chunk)
+        elif isinstance(output_chunk, tuple):
+            for x1, x2 in zip(out, output_chunk):
+                if _add_into_out:
+                    x1[i : i + chunk_size] += x2
+                else:
+                    x1[i : i + chunk_size] = x2
+        elif isinstance(output_chunk, torch.Tensor):
+            if _add_into_out:
+                out[i : i + chunk_size] += output_chunk
+            else:
+                out[i : i + chunk_size] = output_chunk
+        else:
+            raise TypeError("Not supported")
+
+        i += chunk_size
+
+    out = tensor_tree_map(lambda t: t.view(orig_batch_dims + t.shape[1:]), out)
+
+    return out
+
+
+class ChunkSizeTuner:
+    def __init__(
+        self,
+        # Heuristically, runtimes for most of the modules in the network
+        # plateau earlier than this on all GPUs I've run the model on.
+        max_chunk_size: int = 512,
+    ):
+        self.max_chunk_size = max_chunk_size
+        self.cached_chunk_size: Optional[int] = None
+        self.cached_arg_data: Optional[tuple] = None
+
+    def _determine_favorable_chunk_size(self, fn: Callable, args: tuple, min_chunk_size: int) -> int:
+        logging.info("Tuning chunk size...")
+
+        if min_chunk_size >= self.max_chunk_size:
+            return min_chunk_size
+
+        candidates: list[int] = [2**l for l in range(int(math.log2(self.max_chunk_size)) + 1)]
+        candidates = [c for c in candidates if c > min_chunk_size]
+        candidates = [min_chunk_size] + candidates
+        candidates[-1] += 4
+
+        def test_chunk_size(chunk_size: int) -> bool:
+            try:
+                with torch.no_grad():
+                    fn(*args, chunk_size=chunk_size)
+                return True
+            except RuntimeError:
+                return False
+
+        min_viable_chunk_size_index = 0
+        i = len(candidates) - 1
+        while i > min_viable_chunk_size_index:
+            viable = test_chunk_size(candidates[i])
+            if not viable:
+                i = (min_viable_chunk_size_index + i) // 2
+            else:
+                min_viable_chunk_size_index = i
+                i = (i + len(candidates) - 1) // 2
+
+        return candidates[min_viable_chunk_size_index]
+
+    def _compare_arg_caches(self, ac1: Iterable, ac2: Iterable) -> bool:
+        consistent = True
+        for a1, a2 in zip(ac1, ac2):
+            assert type(ac1) is type(ac2)
+            if isinstance(ac1, (list, tuple)):
+                consistent &= self._compare_arg_caches(a1, a2)
+            elif isinstance(ac1, dict):
+                a1_items = [v for _, v in sorted(a1.items(), key=lambda x: x[0])]
+                a2_items = [v for _, v in sorted(a2.items(), key=lambda x: x[0])]
+                consistent &= self._compare_arg_caches(a1_items, a2_items)
+            else:
+                consistent &= a1 == a2
+
+        return consistent
+
+    def tune_chunk_size(
+        self,
+        representative_fn: Callable,
+        args: tuple,
+        min_chunk_size: int,
+    ) -> int:
+        consistent = True
+        arg_data: tuple = tree_map(lambda a: a.shape if isinstance(a, torch.Tensor) else a, args, object)
+        if self.cached_arg_data is not None:
+            # If args have changed shape/value, we need to re-tune
+            assert len(self.cached_arg_data) == len(arg_data)
+            consistent = self._compare_arg_caches(self.cached_arg_data, arg_data)
+        else:
+            # Otherwise, we can reuse the precomputed value
+            consistent = False
+
+        if not consistent:
+            self.cached_chunk_size = self._determine_favorable_chunk_size(
+                representative_fn,
+                args,
+                min_chunk_size,
+            )
+            self.cached_arg_data = arg_data
+
+        assert self.cached_chunk_size is not None
+
+        return self.cached_chunk_size
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py
new file mode 100644
index 0000000000000000000000000000000000000000..e92e65d29bfb2e7a0ad640e71d983c1c467d3a54
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/residue_constants.py
@@ -0,0 +1,979 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+"""Constants used in AlphaFold."""
+
+import collections
+import copy
+import functools
+from collections.abc import Mapping, Sequence
+from importlib import resources
+
+import numpy as np
+
+
+# Internal import (35fd).
+
+
+# Distance from one CA to next CA [trans configuration: omega = 180].
+ca_ca = 3.80209737096
+
+# Format: The list for each AA type contains chi1, chi2, chi3, chi4 in
+# this order (or a relevant subset from chi1 onwards). ALA and GLY don't have
+# chi angles so their chi angle lists are empty.
+chi_angles_atoms: dict[str, list[list[str]]] = {
+    "ALA": [],
+    # Chi5 in arginine is always 0 +- 5 degrees, so ignore it.
+    "ARG": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "NE"], ["CG", "CD", "NE", "CZ"]],
+    "ASN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
+    "ASP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "OD1"]],
+    "CYS": [["N", "CA", "CB", "SG"]],
+    "GLN": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]],
+    "GLU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "OE1"]],
+    "GLY": [],
+    "HIS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "ND1"]],
+    "ILE": [["N", "CA", "CB", "CG1"], ["CA", "CB", "CG1", "CD1"]],
+    "LEU": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "LYS": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"], ["CB", "CG", "CD", "CE"], ["CG", "CD", "CE", "NZ"]],
+    "MET": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "SD"], ["CB", "CG", "SD", "CE"]],
+    "PHE": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "PRO": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD"]],
+    "SER": [["N", "CA", "CB", "OG"]],
+    "THR": [["N", "CA", "CB", "OG1"]],
+    "TRP": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "TYR": [["N", "CA", "CB", "CG"], ["CA", "CB", "CG", "CD1"]],
+    "VAL": [["N", "CA", "CB", "CG1"]],
+}
+
+# If chi angles given in fixed-length array, this matrix determines how to mask
+# them for each AA type. The order is as per restype_order (see below).
+chi_angles_mask: list[list[float]] = [
+    [0.0, 0.0, 0.0, 0.0],  # ALA
+    [1.0, 1.0, 1.0, 1.0],  # ARG
+    [1.0, 1.0, 0.0, 0.0],  # ASN
+    [1.0, 1.0, 0.0, 0.0],  # ASP
+    [1.0, 0.0, 0.0, 0.0],  # CYS
+    [1.0, 1.0, 1.0, 0.0],  # GLN
+    [1.0, 1.0, 1.0, 0.0],  # GLU
+    [0.0, 0.0, 0.0, 0.0],  # GLY
+    [1.0, 1.0, 0.0, 0.0],  # HIS
+    [1.0, 1.0, 0.0, 0.0],  # ILE
+    [1.0, 1.0, 0.0, 0.0],  # LEU
+    [1.0, 1.0, 1.0, 1.0],  # LYS
+    [1.0, 1.0, 1.0, 0.0],  # MET
+    [1.0, 1.0, 0.0, 0.0],  # PHE
+    [1.0, 1.0, 0.0, 0.0],  # PRO
+    [1.0, 0.0, 0.0, 0.0],  # SER
+    [1.0, 0.0, 0.0, 0.0],  # THR
+    [1.0, 1.0, 0.0, 0.0],  # TRP
+    [1.0, 1.0, 0.0, 0.0],  # TYR
+    [1.0, 0.0, 0.0, 0.0],  # VAL
+]
+
+# The following chi angles are pi periodic: they can be rotated by a multiple
+# of pi without affecting the structure.
+chi_pi_periodic: list[list[float]] = [
+    [0.0, 0.0, 0.0, 0.0],  # ALA
+    [0.0, 0.0, 0.0, 0.0],  # ARG
+    [0.0, 0.0, 0.0, 0.0],  # ASN
+    [0.0, 1.0, 0.0, 0.0],  # ASP
+    [0.0, 0.0, 0.0, 0.0],  # CYS
+    [0.0, 0.0, 0.0, 0.0],  # GLN
+    [0.0, 0.0, 1.0, 0.0],  # GLU
+    [0.0, 0.0, 0.0, 0.0],  # GLY
+    [0.0, 0.0, 0.0, 0.0],  # HIS
+    [0.0, 0.0, 0.0, 0.0],  # ILE
+    [0.0, 0.0, 0.0, 0.0],  # LEU
+    [0.0, 0.0, 0.0, 0.0],  # LYS
+    [0.0, 0.0, 0.0, 0.0],  # MET
+    [0.0, 1.0, 0.0, 0.0],  # PHE
+    [0.0, 0.0, 0.0, 0.0],  # PRO
+    [0.0, 0.0, 0.0, 0.0],  # SER
+    [0.0, 0.0, 0.0, 0.0],  # THR
+    [0.0, 0.0, 0.0, 0.0],  # TRP
+    [0.0, 1.0, 0.0, 0.0],  # TYR
+    [0.0, 0.0, 0.0, 0.0],  # VAL
+    [0.0, 0.0, 0.0, 0.0],  # UNK
+]
+
+# Atoms positions relative to the 8 rigid groups, defined by the pre-omega, phi,
+# psi and chi angles:
+# 0: 'backbone group',
+# 1: 'pre-omega-group', (empty)
+# 2: 'phi-group', (currently empty, because it defines only hydrogens)
+# 3: 'psi-group',
+# 4,5,6,7: 'chi1,2,3,4-group'
+# The atom positions are relative to the axis-end-atom of the corresponding
+# rotation axis. The x-axis is in direction of the rotation axis, and the y-axis
+# is defined such that the dihedral-angle-definiting atom (the last entry in
+# chi_angles_atoms above) is in the xy-plane (with a positive y-coordinate).
+# format: [atomname, group_idx, rel_position]
+rigid_group_atom_positions: dict[str, list[tuple[str, int, tuple[float, float, float]]]] = {
+    "ALA": [
+        ("N", 0, (-0.525, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, -0.000, -0.000)),
+        ("CB", 0, (-0.529, -0.774, -1.205)),
+        ("O", 3, (0.627, 1.062, 0.000)),
+    ],
+    "ARG": [
+        ("N", 0, (-0.524, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, -0.000)),
+        ("CB", 0, (-0.524, -0.778, -1.209)),
+        ("O", 3, (0.626, 1.062, 0.000)),
+        ("CG", 4, (0.616, 1.390, -0.000)),
+        ("CD", 5, (0.564, 1.414, 0.000)),
+        ("NE", 6, (0.539, 1.357, -0.000)),
+        ("NH1", 7, (0.206, 2.301, 0.000)),
+        ("NH2", 7, (2.078, 0.978, -0.000)),
+        ("CZ", 7, (0.758, 1.093, -0.000)),
+    ],
+    "ASN": [
+        ("N", 0, (-0.536, 1.357, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, -0.000, -0.000)),
+        ("CB", 0, (-0.531, -0.787, -1.200)),
+        ("O", 3, (0.625, 1.062, 0.000)),
+        ("CG", 4, (0.584, 1.399, 0.000)),
+        ("ND2", 5, (0.593, -1.188, 0.001)),
+        ("OD1", 5, (0.633, 1.059, 0.000)),
+    ],
+    "ASP": [
+        ("N", 0, (-0.525, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, 0.000, -0.000)),
+        ("CB", 0, (-0.526, -0.778, -1.208)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.593, 1.398, -0.000)),
+        ("OD1", 5, (0.610, 1.091, 0.000)),
+        ("OD2", 5, (0.592, -1.101, -0.003)),
+    ],
+    "CYS": [
+        ("N", 0, (-0.522, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.524, 0.000, 0.000)),
+        ("CB", 0, (-0.519, -0.773, -1.212)),
+        ("O", 3, (0.625, 1.062, -0.000)),
+        ("SG", 4, (0.728, 1.653, 0.000)),
+    ],
+    "GLN": [
+        ("N", 0, (-0.526, 1.361, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, 0.000, 0.000)),
+        ("CB", 0, (-0.525, -0.779, -1.207)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.615, 1.393, 0.000)),
+        ("CD", 5, (0.587, 1.399, -0.000)),
+        ("NE2", 6, (0.593, -1.189, -0.001)),
+        ("OE1", 6, (0.634, 1.060, 0.000)),
+    ],
+    "GLU": [
+        ("N", 0, (-0.528, 1.361, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, -0.000, -0.000)),
+        ("CB", 0, (-0.526, -0.781, -1.207)),
+        ("O", 3, (0.626, 1.062, 0.000)),
+        ("CG", 4, (0.615, 1.392, 0.000)),
+        ("CD", 5, (0.600, 1.397, 0.000)),
+        ("OE1", 6, (0.607, 1.095, -0.000)),
+        ("OE2", 6, (0.589, -1.104, -0.001)),
+    ],
+    "GLY": [
+        ("N", 0, (-0.572, 1.337, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.517, -0.000, -0.000)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+    ],
+    "HIS": [
+        ("N", 0, (-0.527, 1.360, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, 0.000, 0.000)),
+        ("CB", 0, (-0.525, -0.778, -1.208)),
+        ("O", 3, (0.625, 1.063, 0.000)),
+        ("CG", 4, (0.600, 1.370, -0.000)),
+        ("CD2", 5, (0.889, -1.021, 0.003)),
+        ("ND1", 5, (0.744, 1.160, -0.000)),
+        ("CE1", 5, (2.030, 0.851, 0.002)),
+        ("NE2", 5, (2.145, -0.466, 0.004)),
+    ],
+    "ILE": [
+        ("N", 0, (-0.493, 1.373, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, -0.000, -0.000)),
+        ("CB", 0, (-0.536, -0.793, -1.213)),
+        ("O", 3, (0.627, 1.062, -0.000)),
+        ("CG1", 4, (0.534, 1.437, -0.000)),
+        ("CG2", 4, (0.540, -0.785, -1.199)),
+        ("CD1", 5, (0.619, 1.391, 0.000)),
+    ],
+    "LEU": [
+        ("N", 0, (-0.520, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, -0.000)),
+        ("CB", 0, (-0.522, -0.773, -1.214)),
+        ("O", 3, (0.625, 1.063, -0.000)),
+        ("CG", 4, (0.678, 1.371, 0.000)),
+        ("CD1", 5, (0.530, 1.430, -0.000)),
+        ("CD2", 5, (0.535, -0.774, 1.200)),
+    ],
+    "LYS": [
+        ("N", 0, (-0.526, 1.362, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, 0.000, 0.000)),
+        ("CB", 0, (-0.524, -0.778, -1.208)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.619, 1.390, 0.000)),
+        ("CD", 5, (0.559, 1.417, 0.000)),
+        ("CE", 6, (0.560, 1.416, 0.000)),
+        ("NZ", 7, (0.554, 1.387, 0.000)),
+    ],
+    "MET": [
+        ("N", 0, (-0.521, 1.364, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, 0.000, 0.000)),
+        ("CB", 0, (-0.523, -0.776, -1.210)),
+        ("O", 3, (0.625, 1.062, -0.000)),
+        ("CG", 4, (0.613, 1.391, -0.000)),
+        ("SD", 5, (0.703, 1.695, 0.000)),
+        ("CE", 6, (0.320, 1.786, -0.000)),
+    ],
+    "PHE": [
+        ("N", 0, (-0.518, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.524, 0.000, -0.000)),
+        ("CB", 0, (-0.525, -0.776, -1.212)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("CG", 4, (0.607, 1.377, 0.000)),
+        ("CD1", 5, (0.709, 1.195, -0.000)),
+        ("CD2", 5, (0.706, -1.196, 0.000)),
+        ("CE1", 5, (2.102, 1.198, -0.000)),
+        ("CE2", 5, (2.098, -1.201, -0.000)),
+        ("CZ", 5, (2.794, -0.003, -0.001)),
+    ],
+    "PRO": [
+        ("N", 0, (-0.566, 1.351, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, -0.000, 0.000)),
+        ("CB", 0, (-0.546, -0.611, -1.293)),
+        ("O", 3, (0.621, 1.066, 0.000)),
+        ("CG", 4, (0.382, 1.445, 0.0)),
+        # ('CD', 5, (0.427, 1.440, 0.0)),
+        ("CD", 5, (0.477, 1.424, 0.0)),  # manually made angle 2 degrees larger
+    ],
+    "SER": [
+        ("N", 0, (-0.529, 1.360, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, -0.000)),
+        ("CB", 0, (-0.518, -0.777, -1.211)),
+        ("O", 3, (0.626, 1.062, -0.000)),
+        ("OG", 4, (0.503, 1.325, 0.000)),
+    ],
+    "THR": [
+        ("N", 0, (-0.517, 1.364, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.526, 0.000, -0.000)),
+        ("CB", 0, (-0.516, -0.793, -1.215)),
+        ("O", 3, (0.626, 1.062, 0.000)),
+        ("CG2", 4, (0.550, -0.718, -1.228)),
+        ("OG1", 4, (0.472, 1.353, 0.000)),
+    ],
+    "TRP": [
+        ("N", 0, (-0.521, 1.363, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.525, -0.000, 0.000)),
+        ("CB", 0, (-0.523, -0.776, -1.212)),
+        ("O", 3, (0.627, 1.062, 0.000)),
+        ("CG", 4, (0.609, 1.370, -0.000)),
+        ("CD1", 5, (0.824, 1.091, 0.000)),
+        ("CD2", 5, (0.854, -1.148, -0.005)),
+        ("CE2", 5, (2.186, -0.678, -0.007)),
+        ("CE3", 5, (0.622, -2.530, -0.007)),
+        ("NE1", 5, (2.140, 0.690, -0.004)),
+        ("CH2", 5, (3.028, -2.890, -0.013)),
+        ("CZ2", 5, (3.283, -1.543, -0.011)),
+        ("CZ3", 5, (1.715, -3.389, -0.011)),
+    ],
+    "TYR": [
+        ("N", 0, (-0.522, 1.362, 0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.524, -0.000, -0.000)),
+        ("CB", 0, (-0.522, -0.776, -1.213)),
+        ("O", 3, (0.627, 1.062, -0.000)),
+        ("CG", 4, (0.607, 1.382, -0.000)),
+        ("CD1", 5, (0.716, 1.195, -0.000)),
+        ("CD2", 5, (0.713, -1.194, -0.001)),
+        ("CE1", 5, (2.107, 1.200, -0.002)),
+        ("CE2", 5, (2.104, -1.201, -0.003)),
+        ("OH", 5, (4.168, -0.002, -0.005)),
+        ("CZ", 5, (2.791, -0.001, -0.003)),
+    ],
+    "VAL": [
+        ("N", 0, (-0.494, 1.373, -0.000)),
+        ("CA", 0, (0.000, 0.000, 0.000)),
+        ("C", 0, (1.527, -0.000, -0.000)),
+        ("CB", 0, (-0.533, -0.795, -1.213)),
+        ("O", 3, (0.627, 1.062, -0.000)),
+        ("CG1", 4, (0.540, 1.429, -0.000)),
+        ("CG2", 4, (0.533, -0.776, 1.203)),
+    ],
+}
+
+# A list of atoms (excluding hydrogen) for each AA type. PDB naming convention.
+residue_atoms: dict[str, list[str]] = {
+    "ALA": ["C", "CA", "CB", "N", "O"],
+    "ARG": ["C", "CA", "CB", "CG", "CD", "CZ", "N", "NE", "O", "NH1", "NH2"],
+    "ASP": ["C", "CA", "CB", "CG", "N", "O", "OD1", "OD2"],
+    "ASN": ["C", "CA", "CB", "CG", "N", "ND2", "O", "OD1"],
+    "CYS": ["C", "CA", "CB", "N", "O", "SG"],
+    "GLU": ["C", "CA", "CB", "CG", "CD", "N", "O", "OE1", "OE2"],
+    "GLN": ["C", "CA", "CB", "CG", "CD", "N", "NE2", "O", "OE1"],
+    "GLY": ["C", "CA", "N", "O"],
+    "HIS": ["C", "CA", "CB", "CG", "CD2", "CE1", "N", "ND1", "NE2", "O"],
+    "ILE": ["C", "CA", "CB", "CG1", "CG2", "CD1", "N", "O"],
+    "LEU": ["C", "CA", "CB", "CG", "CD1", "CD2", "N", "O"],
+    "LYS": ["C", "CA", "CB", "CG", "CD", "CE", "N", "NZ", "O"],
+    "MET": ["C", "CA", "CB", "CG", "CE", "N", "O", "SD"],
+    "PHE": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O"],
+    "PRO": ["C", "CA", "CB", "CG", "CD", "N", "O"],
+    "SER": ["C", "CA", "CB", "N", "O", "OG"],
+    "THR": ["C", "CA", "CB", "CG2", "N", "O", "OG1"],
+    "TRP": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE2", "CE3", "CZ2", "CZ3", "CH2", "N", "NE1", "O"],
+    "TYR": ["C", "CA", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "N", "O", "OH"],
+    "VAL": ["C", "CA", "CB", "CG1", "CG2", "N", "O"],
+}
+
+# Naming swaps for ambiguous atom names.
+# Due to symmetries in the amino acids the naming of atoms is ambiguous in
+# 4 of the 20 amino acids.
+# (The LDDT paper lists 7 amino acids as ambiguous, but the naming ambiguities
+# in LEU, VAL and ARG can be resolved by using the 3d constellations of
+# the 'ambiguous' atoms and their neighbours)
+# TODO: ^ interpret this
+residue_atom_renaming_swaps: dict[str, dict[str, str]] = {
+    "ASP": {"OD1": "OD2"},
+    "GLU": {"OE1": "OE2"},
+    "PHE": {"CD1": "CD2", "CE1": "CE2"},
+    "TYR": {"CD1": "CD2", "CE1": "CE2"},
+}
+
+# Van der Waals radii [Angstroem] of the atoms (from Wikipedia)
+van_der_waals_radius: dict[str, float] = {
+    "C": 1.7,
+    "N": 1.55,
+    "O": 1.52,
+    "S": 1.8,
+}
+
+Bond = collections.namedtuple("Bond", ["atom1_name", "atom2_name", "length", "stddev"])
+BondAngle = collections.namedtuple(
+    "BondAngle",
+    ["atom1_name", "atom2_name", "atom3name", "angle_rad", "stddev"],
+)
+
+
+def map_structure_with_atom_order(in_list: list, first_call: bool = True) -> list:
+    # Maps strings in a nested list structure to their corresponding index in atom_order
+    if first_call:
+        in_list = copy.deepcopy(in_list)
+    for i in range(len(in_list)):
+        if isinstance(in_list[i], list):
+            in_list[i] = map_structure_with_atom_order(in_list[i], first_call=False)
+        elif isinstance(in_list[i], str):
+            in_list[i] = atom_order[in_list[i]]
+        else:
+            raise TypeError("Unexpected type when mapping nested lists!")
+    return in_list
+
+
+@functools.cache
+def load_stereo_chemical_props() -> tuple[
+    Mapping[str, list[Bond]],
+    Mapping[str, list[Bond]],
+    Mapping[str, list[BondAngle]],
+]:
+    """Load stereo_chemical_props.txt into a nice structure.
+
+    Load literature values for bond lengths and bond angles and translate bond angles into the length of the opposite
+    edge of the triangle ("residue_virtual_bonds").
+
+    Returns:
+      residue_bonds: dict that maps resname --> list of Bond tuples residue_virtual_bonds: dict that maps resname -->
+      list of Bond tuples residue_bond_angles: dict that maps resname --> list of BondAngle tuples
+    """
+    # TODO: this file should be downloaded in a setup script
+    stereo_chemical_props = resources.read_text("openfold.resources", "stereo_chemical_props.txt")
+
+    lines_iter = iter(stereo_chemical_props.splitlines())
+    # Load bond lengths.
+    residue_bonds: dict[str, list[Bond]] = {}
+    next(lines_iter)  # Skip header line.
+    for line in lines_iter:
+        if line.strip() == "-":
+            break
+        bond, resname, bond_length, stddev = line.split()
+        atom1, atom2 = bond.split("-")
+        if resname not in residue_bonds:
+            residue_bonds[resname] = []
+        residue_bonds[resname].append(Bond(atom1, atom2, float(bond_length), float(stddev)))
+    residue_bonds["UNK"] = []
+
+    # Load bond angles.
+    residue_bond_angles: dict[str, list[BondAngle]] = {}
+    next(lines_iter)  # Skip empty line.
+    next(lines_iter)  # Skip header line.
+    for line in lines_iter:
+        if line.strip() == "-":
+            break
+        bond, resname, angle_degree, stddev_degree = line.split()
+        atom1, atom2, atom3 = bond.split("-")
+        if resname not in residue_bond_angles:
+            residue_bond_angles[resname] = []
+        residue_bond_angles[resname].append(
+            BondAngle(
+                atom1,
+                atom2,
+                atom3,
+                float(angle_degree) / 180.0 * np.pi,
+                float(stddev_degree) / 180.0 * np.pi,
+            )
+        )
+    residue_bond_angles["UNK"] = []
+
+    def make_bond_key(atom1_name: str, atom2_name: str) -> str:
+        """Unique key to lookup bonds."""
+        return "-".join(sorted([atom1_name, atom2_name]))
+
+    # Translate bond angles into distances ("virtual bonds").
+    residue_virtual_bonds: dict[str, list[Bond]] = {}
+    for resname, bond_angles in residue_bond_angles.items():
+        # Create a fast lookup dict for bond lengths.
+        bond_cache: dict[str, Bond] = {}
+        for b in residue_bonds[resname]:
+            bond_cache[make_bond_key(b.atom1_name, b.atom2_name)] = b
+        residue_virtual_bonds[resname] = []
+        for ba in bond_angles:
+            bond1 = bond_cache[make_bond_key(ba.atom1_name, ba.atom2_name)]
+            bond2 = bond_cache[make_bond_key(ba.atom2_name, ba.atom3name)]
+
+            # Compute distance between atom1 and atom3 using the law of cosines
+            # c^2 = a^2 + b^2 - 2ab*cos(gamma).
+            gamma = ba.angle_rad
+            length = np.sqrt(bond1.length**2 + bond2.length**2 - 2 * bond1.length * bond2.length * np.cos(gamma))
+
+            # Propagation of uncertainty assuming uncorrelated errors.
+            dl_outer = 0.5 / length
+            dl_dgamma = (2 * bond1.length * bond2.length * np.sin(gamma)) * dl_outer
+            dl_db1 = (2 * bond1.length - 2 * bond2.length * np.cos(gamma)) * dl_outer
+            dl_db2 = (2 * bond2.length - 2 * bond1.length * np.cos(gamma)) * dl_outer
+            stddev = np.sqrt(
+                (dl_dgamma * ba.stddev) ** 2 + (dl_db1 * bond1.stddev) ** 2 + (dl_db2 * bond2.stddev) ** 2
+            )
+            residue_virtual_bonds[resname].append(Bond(ba.atom1_name, ba.atom3name, length, stddev))
+
+    return (residue_bonds, residue_virtual_bonds, residue_bond_angles)
+
+
+# Between-residue bond lengths for general bonds (first element) and for Proline
+# (second element).
+between_res_bond_length_c_n: tuple[float, float] = (1.329, 1.341)
+between_res_bond_length_stddev_c_n: tuple[float, float] = (0.014, 0.016)
+
+# Between-residue cos_angles.
+between_res_cos_angles_c_n_ca: tuple[float, float] = (-0.5203, 0.0353)  # degrees: 121.352 +- 2.315
+between_res_cos_angles_ca_c_n: tuple[float, float] = (-0.4473, 0.0311)  # degrees: 116.568 +- 1.995
+
+# This mapping is used when we need to store atom data in a format that requires
+# fixed atom data size for every residue (e.g. a numpy array).
+atom_types: list[str] = [
+    "N",
+    "CA",
+    "C",
+    "CB",
+    "O",
+    "CG",
+    "CG1",
+    "CG2",
+    "OG",
+    "OG1",
+    "SG",
+    "CD",
+    "CD1",
+    "CD2",
+    "ND1",
+    "ND2",
+    "OD1",
+    "OD2",
+    "SD",
+    "CE",
+    "CE1",
+    "CE2",
+    "CE3",
+    "NE",
+    "NE1",
+    "NE2",
+    "OE1",
+    "OE2",
+    "CH2",
+    "NH1",
+    "NH2",
+    "OH",
+    "CZ",
+    "CZ2",
+    "CZ3",
+    "NZ",
+    "OXT",
+]
+atom_order: dict[str, int] = {atom_type: i for i, atom_type in enumerate(atom_types)}
+atom_type_num = len(atom_types)  # := 37.
+
+# A compact atom encoding with 14 columns
+# pylint: disable=line-too-long
+restype_name_to_atom14_names: dict[str, list[str]] = {
+    "ALA": ["N", "CA", "C", "O", "CB", "", "", "", "", "", "", "", "", ""],
+    "ARG": ["N", "CA", "C", "O", "CB", "CG", "CD", "NE", "CZ", "NH1", "NH2", "", "", ""],
+    "ASN": ["N", "CA", "C", "O", "CB", "CG", "OD1", "ND2", "", "", "", "", "", ""],
+    "ASP": ["N", "CA", "C", "O", "CB", "CG", "OD1", "OD2", "", "", "", "", "", ""],
+    "CYS": ["N", "CA", "C", "O", "CB", "SG", "", "", "", "", "", "", "", ""],
+    "GLN": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "NE2", "", "", "", "", ""],
+    "GLU": ["N", "CA", "C", "O", "CB", "CG", "CD", "OE1", "OE2", "", "", "", "", ""],
+    "GLY": ["N", "CA", "C", "O", "", "", "", "", "", "", "", "", "", ""],
+    "HIS": ["N", "CA", "C", "O", "CB", "CG", "ND1", "CD2", "CE1", "NE2", "", "", "", ""],
+    "ILE": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "CD1", "", "", "", "", "", ""],
+    "LEU": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "", "", "", "", "", ""],
+    "LYS": ["N", "CA", "C", "O", "CB", "CG", "CD", "CE", "NZ", "", "", "", "", ""],
+    "MET": ["N", "CA", "C", "O", "CB", "CG", "SD", "CE", "", "", "", "", "", ""],
+    "PHE": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "", "", ""],
+    "PRO": ["N", "CA", "C", "O", "CB", "CG", "CD", "", "", "", "", "", "", ""],
+    "SER": ["N", "CA", "C", "O", "CB", "OG", "", "", "", "", "", "", "", ""],
+    "THR": ["N", "CA", "C", "O", "CB", "OG1", "CG2", "", "", "", "", "", "", ""],
+    "TRP": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "NE1", "CE2", "CE3", "CZ2", "CZ3", "CH2"],
+    "TYR": ["N", "CA", "C", "O", "CB", "CG", "CD1", "CD2", "CE1", "CE2", "CZ", "OH", "", ""],
+    "VAL": ["N", "CA", "C", "O", "CB", "CG1", "CG2", "", "", "", "", "", "", ""],
+    "UNK": ["", "", "", "", "", "", "", "", "", "", "", "", "", ""],
+}
+# pylint: enable=line-too-long
+
+
+# This is the standard residue order when coding AA type as a number.
+# Reproduce it by taking 3-letter AA codes and sorting them alphabetically.
+restypes: list[str] = [
+    "A",
+    "R",
+    "N",
+    "D",
+    "C",
+    "Q",
+    "E",
+    "G",
+    "H",
+    "I",
+    "L",
+    "K",
+    "M",
+    "F",
+    "P",
+    "S",
+    "T",
+    "W",
+    "Y",
+    "V",
+]
+restype_order: dict[str, int] = {restype: i for i, restype in enumerate(restypes)}
+restype_num = len(restypes)  # := 20.
+unk_restype_index = restype_num  # Catch-all index for unknown restypes.
+
+restypes_with_x: list[str] = restypes + ["X"]
+restype_order_with_x: dict[str, int] = {restype: i for i, restype in enumerate(restypes_with_x)}
+
+
+def sequence_to_onehot(sequence: str, mapping: Mapping[str, int], map_unknown_to_x: bool = False) -> np.ndarray:
+    """Maps the given sequence into a one-hot encoded matrix.
+
+    Args:
+      sequence: An amino acid sequence.
+      mapping: A dictionary mapping amino acids to integers.
+      map_unknown_to_x: If True, any amino acid that is not in the mapping will be
+        mapped to the unknown amino acid 'X'. If the mapping doesn't contain amino acid 'X', an error will be thrown.
+        If False, any amino acid not in the mapping will throw an error.
+
+    Returns:
+      A numpy array of shape (seq_len, num_unique_aas) with one-hot encoding of the sequence.
+
+    Raises:
+      ValueError: If the mapping doesn't contain values from 0 to
+        num_unique_aas - 1 without any gaps.
+    """
+    num_entries = max(mapping.values()) + 1
+
+    if sorted(set(mapping.values())) != list(range(num_entries)):
+        raise ValueError(
+            "The mapping must have values from 0 to num_unique_aas-1 without any gaps. Got: %s"
+            % sorted(mapping.values())
+        )
+
+    one_hot_arr = np.zeros((len(sequence), num_entries), dtype=np.int32)
+
+    for aa_index, aa_type in enumerate(sequence):
+        if map_unknown_to_x:
+            if aa_type.isalpha() and aa_type.isupper():
+                aa_id = mapping.get(aa_type, mapping["X"])
+            else:
+                raise ValueError(f"Invalid character in the sequence: {aa_type}")
+        else:
+            aa_id = mapping[aa_type]
+        one_hot_arr[aa_index, aa_id] = 1
+
+    return one_hot_arr
+
+
+restype_1to3: dict[str, str] = {
+    "A": "ALA",
+    "R": "ARG",
+    "N": "ASN",
+    "D": "ASP",
+    "C": "CYS",
+    "Q": "GLN",
+    "E": "GLU",
+    "G": "GLY",
+    "H": "HIS",
+    "I": "ILE",
+    "L": "LEU",
+    "K": "LYS",
+    "M": "MET",
+    "F": "PHE",
+    "P": "PRO",
+    "S": "SER",
+    "T": "THR",
+    "W": "TRP",
+    "Y": "TYR",
+    "V": "VAL",
+}
+
+
+# NB: restype_3to1 differs from Bio.PDB.protein_letters_3to1 by being a simple
+# 1-to-1 mapping of 3 letter names to one letter names. The latter contains
+# many more, and less common, three letter names as keys and maps many of these
+# to the same one letter name (including 'X' and 'U' which we don't use here).
+restype_3to1: dict[str, str] = {v: k for k, v in restype_1to3.items()}
+
+# Define a restype name for all unknown residues.
+unk_restype = "UNK"
+
+resnames: list[str] = [restype_1to3[r] for r in restypes] + [unk_restype]
+resname_to_idx: dict[str, int] = {resname: i for i, resname in enumerate(resnames)}
+
+
+# The mapping here uses hhblits convention, so that B is mapped to D, J and O
+# are mapped to X, U is mapped to C, and Z is mapped to E. Other than that the
+# remaining 20 amino acids are kept in alphabetical order.
+# There are 2 non-amino acid codes, X (representing any amino acid) and
+# "-" representing a missing amino acid in an alignment.  The id for these
+# codes is put at the end (20 and 21) so that they can easily be ignored if
+# desired.
+HHBLITS_AA_TO_ID: dict[str, int] = {
+    "A": 0,
+    "B": 2,
+    "C": 1,
+    "D": 2,
+    "E": 3,
+    "F": 4,
+    "G": 5,
+    "H": 6,
+    "I": 7,
+    "J": 20,
+    "K": 8,
+    "L": 9,
+    "M": 10,
+    "N": 11,
+    "O": 20,
+    "P": 12,
+    "Q": 13,
+    "R": 14,
+    "S": 15,
+    "T": 16,
+    "U": 1,
+    "V": 17,
+    "W": 18,
+    "X": 20,
+    "Y": 19,
+    "Z": 3,
+    "-": 21,
+}
+
+# Partial inversion of HHBLITS_AA_TO_ID.
+ID_TO_HHBLITS_AA: dict[int, str] = {
+    0: "A",
+    1: "C",  # Also U.
+    2: "D",  # Also B.
+    3: "E",  # Also Z.
+    4: "F",
+    5: "G",
+    6: "H",
+    7: "I",
+    8: "K",
+    9: "L",
+    10: "M",
+    11: "N",
+    12: "P",
+    13: "Q",
+    14: "R",
+    15: "S",
+    16: "T",
+    17: "V",
+    18: "W",
+    19: "Y",
+    20: "X",  # Includes J and O.
+    21: "-",
+}
+
+restypes_with_x_and_gap: list[str] = restypes + ["X", "-"]
+MAP_HHBLITS_AATYPE_TO_OUR_AATYPE: tuple[int, ...] = tuple(
+    restypes_with_x_and_gap.index(ID_TO_HHBLITS_AA[i]) for i in range(len(restypes_with_x_and_gap))
+)
+
+
+def _make_standard_atom_mask() -> np.ndarray:
+    """Returns [num_res_types, num_atom_types] mask array."""
+    # +1 to account for unknown (all 0s).
+    mask = np.zeros([restype_num + 1, atom_type_num], dtype=np.int32)
+    for restype, restype_letter in enumerate(restypes):
+        restype_name = restype_1to3[restype_letter]
+        atom_names = residue_atoms[restype_name]
+        for atom_name in atom_names:
+            atom_type = atom_order[atom_name]
+            mask[restype, atom_type] = 1
+    return mask
+
+
+STANDARD_ATOM_MASK = _make_standard_atom_mask()
+
+
+# A one hot representation for the first and second atoms defining the axis
+# of rotation for each chi-angle in each residue.
+def chi_angle_atom(atom_index: int) -> np.ndarray:
+    """Define chi-angle rigid groups via one-hot representations."""
+    chi_angles_index = {}
+    one_hots = []
+
+    for k, v in chi_angles_atoms.items():
+        indices = [atom_types.index(s[atom_index]) for s in v]
+        indices.extend([-1] * (4 - len(indices)))
+        chi_angles_index[k] = indices
+
+    for r in restypes:
+        res3 = restype_1to3[r]
+        one_hot = np.eye(atom_type_num)[chi_angles_index[res3]]
+        one_hots.append(one_hot)
+
+    one_hots.append(np.zeros([4, atom_type_num]))  # Add zeros for residue `X`.
+    one_hot = np.stack(one_hots, axis=0)
+    one_hot = np.transpose(one_hot, [0, 2, 1])
+
+    return one_hot
+
+
+chi_atom_1_one_hot = chi_angle_atom(1)
+chi_atom_2_one_hot = chi_angle_atom(2)
+
+# An array like chi_angles_atoms but using indices rather than names.
+chi_angles_atom_indices_list: list[list[list[str]]] = [chi_angles_atoms[restype_1to3[r]] for r in restypes]
+chi_angles_atom_indices_ours: list = map_structure_with_atom_order(chi_angles_atom_indices_list)
+chi_angles_atom_indices = np.array(
+    [chi_atoms + ([[0, 0, 0, 0]] * (4 - len(chi_atoms))) for chi_atoms in chi_angles_atom_indices_list]
+)
+
+# Mapping from (res_name, atom_name) pairs to the atom's chi group index
+# and atom index within that group.
+chi_groups_for_atom: dict[tuple[str, str], list[tuple[int, int]]] = collections.defaultdict(list)
+for res_name, chi_angle_atoms_for_res in chi_angles_atoms.items():
+    for chi_group_i, chi_group in enumerate(chi_angle_atoms_for_res):
+        for atom_i, atom in enumerate(chi_group):
+            chi_groups_for_atom[(res_name, atom)].append((chi_group_i, atom_i))
+chi_groups_for_atom = dict(chi_groups_for_atom)
+
+
+def _make_rigid_transformation_4x4(ex: np.ndarray, ey: np.ndarray, translation: np.ndarray) -> np.ndarray:
+    """Create a rigid 4x4 transformation matrix from two axes and transl."""
+    # Normalize ex.
+    ex_normalized = ex / np.linalg.norm(ex)
+
+    # make ey perpendicular to ex
+    ey_normalized = ey - np.dot(ey, ex_normalized) * ex_normalized
+    ey_normalized /= np.linalg.norm(ey_normalized)
+
+    # compute ez as cross product
+    eznorm = np.cross(ex_normalized, ey_normalized)
+    m = np.stack([ex_normalized, ey_normalized, eznorm, translation]).transpose()
+    m = np.concatenate([m, [[0.0, 0.0, 0.0, 1.0]]], axis=0)
+    return m
+
+
+# create an array with (restype, atomtype) --> rigid_group_idx
+# and an array with (restype, atomtype, coord) for the atom positions
+# and compute affine transformation matrices (4,4) from one rigid group to the
+# previous group
+restype_atom37_to_rigid_group = np.zeros([21, 37], dtype=int)
+restype_atom37_mask = np.zeros([21, 37], dtype=np.float32)
+restype_atom37_rigid_group_positions = np.zeros([21, 37, 3], dtype=np.float32)
+restype_atom14_to_rigid_group = np.zeros([21, 14], dtype=int)
+restype_atom14_mask = np.zeros([21, 14], dtype=np.float32)
+restype_atom14_rigid_group_positions = np.zeros([21, 14, 3], dtype=np.float32)
+restype_rigid_group_default_frame = np.zeros([21, 8, 4, 4], dtype=np.float32)
+
+
+def _make_rigid_group_constants() -> None:
+    """Fill the arrays above."""
+    for restype, restype_letter in enumerate(restypes):
+        resname = restype_1to3[restype_letter]
+        for atomname, group_idx, atom_position in rigid_group_atom_positions[resname]:
+            atomtype = atom_order[atomname]
+            restype_atom37_to_rigid_group[restype, atomtype] = group_idx
+            restype_atom37_mask[restype, atomtype] = 1
+            restype_atom37_rigid_group_positions[restype, atomtype, :] = atom_position
+
+            atom14idx = restype_name_to_atom14_names[resname].index(atomname)
+            restype_atom14_to_rigid_group[restype, atom14idx] = group_idx
+            restype_atom14_mask[restype, atom14idx] = 1
+            restype_atom14_rigid_group_positions[restype, atom14idx, :] = atom_position
+
+    for restype, restype_letter in enumerate(restypes):
+        resname = restype_1to3[restype_letter]
+        atom_positions: dict[str, np.ndarray] = {
+            name: np.array(pos) for name, _, pos in rigid_group_atom_positions[resname]
+        }
+
+        # backbone to backbone is the identity transform
+        restype_rigid_group_default_frame[restype, 0, :, :] = np.eye(4)
+
+        # pre-omega-frame to backbone (currently dummy identity matrix)
+        restype_rigid_group_default_frame[restype, 1, :, :] = np.eye(4)
+
+        # phi-frame to backbone
+        mat = _make_rigid_transformation_4x4(
+            ex=atom_positions["N"] - atom_positions["CA"],
+            ey=np.array([1.0, 0.0, 0.0]),
+            translation=atom_positions["N"],
+        )
+        restype_rigid_group_default_frame[restype, 2, :, :] = mat
+
+        # psi-frame to backbone
+        mat = _make_rigid_transformation_4x4(
+            ex=atom_positions["C"] - atom_positions["CA"],
+            ey=atom_positions["CA"] - atom_positions["N"],
+            translation=atom_positions["C"],
+        )
+        restype_rigid_group_default_frame[restype, 3, :, :] = mat
+
+        # chi1-frame to backbone
+        if chi_angles_mask[restype][0]:
+            base_atom_names = chi_angles_atoms[resname][0]
+            base_atom_positions = [atom_positions[name] for name in base_atom_names]
+            mat = _make_rigid_transformation_4x4(
+                ex=base_atom_positions[2] - base_atom_positions[1],
+                ey=base_atom_positions[0] - base_atom_positions[1],
+                translation=base_atom_positions[2],
+            )
+            restype_rigid_group_default_frame[restype, 4, :, :] = mat
+
+        # chi2-frame to chi1-frame
+        # chi3-frame to chi2-frame
+        # chi4-frame to chi3-frame
+        # luckily all rotation axes for the next frame start at (0,0,0) of the
+        # previous frame
+        for chi_idx in range(1, 4):
+            if chi_angles_mask[restype][chi_idx]:
+                axis_end_atom_name = chi_angles_atoms[resname][chi_idx][2]
+                axis_end_atom_position = atom_positions[axis_end_atom_name]
+                mat = _make_rigid_transformation_4x4(
+                    ex=axis_end_atom_position,
+                    ey=np.array([-1.0, 0.0, 0.0]),
+                    translation=axis_end_atom_position,
+                )
+                restype_rigid_group_default_frame[restype, 4 + chi_idx, :, :] = mat
+
+
+_make_rigid_group_constants()
+
+
+def make_atom14_dists_bounds(
+    overlap_tolerance: float = 1.5,
+    bond_length_tolerance_factor: int = 15,
+) -> dict[str, np.ndarray]:
+    """compute upper and lower bounds for bonds to assess violations."""
+    restype_atom14_bond_lower_bound = np.zeros([21, 14, 14], np.float32)
+    restype_atom14_bond_upper_bound = np.zeros([21, 14, 14], np.float32)
+    restype_atom14_bond_stddev = np.zeros([21, 14, 14], np.float32)
+    residue_bonds, residue_virtual_bonds, _ = load_stereo_chemical_props()
+    for restype, restype_letter in enumerate(restypes):
+        resname = restype_1to3[restype_letter]
+        atom_list = restype_name_to_atom14_names[resname]
+
+        # create lower and upper bounds for clashes
+        for atom1_idx, atom1_name in enumerate(atom_list):
+            if not atom1_name:
+                continue
+            atom1_radius = van_der_waals_radius[atom1_name[0]]
+            for atom2_idx, atom2_name in enumerate(atom_list):
+                if (not atom2_name) or atom1_idx == atom2_idx:
+                    continue
+                atom2_radius = van_der_waals_radius[atom2_name[0]]
+                lower = atom1_radius + atom2_radius - overlap_tolerance
+                upper = 1e10
+                restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
+                restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
+                restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
+                restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
+
+        # overwrite lower and upper bounds for bonds and angles
+        for b in residue_bonds[resname] + residue_virtual_bonds[resname]:
+            atom1_idx = atom_list.index(b.atom1_name)
+            atom2_idx = atom_list.index(b.atom2_name)
+            lower = b.length - bond_length_tolerance_factor * b.stddev
+            upper = b.length + bond_length_tolerance_factor * b.stddev
+            restype_atom14_bond_lower_bound[restype, atom1_idx, atom2_idx] = lower
+            restype_atom14_bond_lower_bound[restype, atom2_idx, atom1_idx] = lower
+            restype_atom14_bond_upper_bound[restype, atom1_idx, atom2_idx] = upper
+            restype_atom14_bond_upper_bound[restype, atom2_idx, atom1_idx] = upper
+            restype_atom14_bond_stddev[restype, atom1_idx, atom2_idx] = b.stddev
+            restype_atom14_bond_stddev[restype, atom2_idx, atom1_idx] = b.stddev
+    return {
+        "lower_bound": restype_atom14_bond_lower_bound,  # shape (21,14,14)
+        "upper_bound": restype_atom14_bond_upper_bound,  # shape (21,14,14)
+        "stddev": restype_atom14_bond_stddev,  # shape (21,14,14)
+    }
+
+
+restype_atom14_ambiguous_atoms = np.zeros((21, 14), dtype=np.float32)
+restype_atom14_ambiguous_atoms_swap_idx: np.ndarray = np.tile(np.arange(14, dtype=int), (21, 1))
+
+
+def _make_atom14_ambiguity_feats() -> None:
+    for res, pairs in residue_atom_renaming_swaps.items():
+        res_idx = restype_order[restype_3to1[res]]
+        for atom1, atom2 in pairs.items():
+            atom1_idx = restype_name_to_atom14_names[res].index(atom1)
+            atom2_idx = restype_name_to_atom14_names[res].index(atom2)
+            restype_atom14_ambiguous_atoms[res_idx, atom1_idx] = 1
+            restype_atom14_ambiguous_atoms[res_idx, atom2_idx] = 1
+            restype_atom14_ambiguous_atoms_swap_idx[res_idx, atom1_idx] = atom2_idx
+            restype_atom14_ambiguous_atoms_swap_idx[res_idx, atom2_idx] = atom1_idx
+
+
+_make_atom14_ambiguity_feats()
+
+
+def aatype_to_str_sequence(aatype: Sequence[int]) -> str:
+    return "".join([restypes_with_x[aatype[i]] for i in range(len(aatype))])
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..e2bc25cb6d53f0428be3f6c1c25e67d67edab739
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/rigid_utils.py
@@ -0,0 +1,1243 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import annotations
+
+from collections.abc import Sequence
+from functools import cache
+from typing import Any, Callable
+
+import numpy as np
+import torch
+
+
+def rot_matmul(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+    """
+    Performs matrix multiplication of two rotation matrix tensors. Written out by hand to avoid AMP downcasting.
+
+    Args:
+        a: [*, 3, 3] left multiplicand
+        b: [*, 3, 3] right multiplicand
+    Returns:
+        The product ab
+    """
+
+    def row_mul(i: int) -> torch.Tensor:
+        return torch.stack(
+            [
+                a[..., i, 0] * b[..., 0, 0] + a[..., i, 1] * b[..., 1, 0] + a[..., i, 2] * b[..., 2, 0],
+                a[..., i, 0] * b[..., 0, 1] + a[..., i, 1] * b[..., 1, 1] + a[..., i, 2] * b[..., 2, 1],
+                a[..., i, 0] * b[..., 0, 2] + a[..., i, 1] * b[..., 1, 2] + a[..., i, 2] * b[..., 2, 2],
+            ],
+            dim=-1,
+        )
+
+    return torch.stack(
+        [
+            row_mul(0),
+            row_mul(1),
+            row_mul(2),
+        ],
+        dim=-2,
+    )
+
+
+def rot_vec_mul(r: torch.Tensor, t: torch.Tensor) -> torch.Tensor:
+    """
+    Applies a rotation to a vector. Written out by hand to avoid transfer to avoid AMP downcasting.
+
+    Args:
+        r: [*, 3, 3] rotation matrices
+        t: [*, 3] coordinate tensors
+    Returns:
+        [*, 3] rotated coordinates
+    """
+    x, y, z = torch.unbind(t, dim=-1)
+    return torch.stack(
+        [
+            r[..., 0, 0] * x + r[..., 0, 1] * y + r[..., 0, 2] * z,
+            r[..., 1, 0] * x + r[..., 1, 1] * y + r[..., 1, 2] * z,
+            r[..., 2, 0] * x + r[..., 2, 1] * y + r[..., 2, 2] * z,
+        ],
+        dim=-1,
+    )
+
+
+@cache
+def identity_rot_mats(
+    batch_dims: tuple[int, ...],
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    requires_grad: bool = True,
+) -> torch.Tensor:
+    rots = torch.eye(3, dtype=dtype, device=device, requires_grad=requires_grad)
+    rots = rots.view(*((1,) * len(batch_dims)), 3, 3)
+    rots = rots.expand(*batch_dims, -1, -1)
+    rots = rots.contiguous()
+
+    return rots
+
+
+@cache
+def identity_trans(
+    batch_dims: tuple[int, ...],
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    requires_grad: bool = True,
+) -> torch.Tensor:
+    trans = torch.zeros((*batch_dims, 3), dtype=dtype, device=device, requires_grad=requires_grad)
+    return trans
+
+
+@cache
+def identity_quats(
+    batch_dims: tuple[int, ...],
+    dtype: torch.dtype | None = None,
+    device: torch.device | None = None,
+    requires_grad: bool = True,
+) -> torch.Tensor:
+    quat = torch.zeros((*batch_dims, 4), dtype=dtype, device=device, requires_grad=requires_grad)
+
+    with torch.no_grad():
+        quat[..., 0] = 1
+
+    return quat
+
+
+_quat_elements: list[str] = ["a", "b", "c", "d"]
+_qtr_keys: list[str] = [l1 + l2 for l1 in _quat_elements for l2 in _quat_elements]
+_qtr_ind_dict: dict[str, int] = {key: ind for ind, key in enumerate(_qtr_keys)}
+
+
+def _to_mat(pairs: list[tuple[str, int]]) -> np.ndarray:
+    mat = np.zeros((4, 4))
+    for key, value in pairs:
+        ind = _qtr_ind_dict[key]
+        mat[ind // 4][ind % 4] = value
+
+    return mat
+
+
+_QTR_MAT = np.zeros((4, 4, 3, 3))
+_QTR_MAT[..., 0, 0] = _to_mat([("aa", 1), ("bb", 1), ("cc", -1), ("dd", -1)])
+_QTR_MAT[..., 0, 1] = _to_mat([("bc", 2), ("ad", -2)])
+_QTR_MAT[..., 0, 2] = _to_mat([("bd", 2), ("ac", 2)])
+_QTR_MAT[..., 1, 0] = _to_mat([("bc", 2), ("ad", 2)])
+_QTR_MAT[..., 1, 1] = _to_mat([("aa", 1), ("bb", -1), ("cc", 1), ("dd", -1)])
+_QTR_MAT[..., 1, 2] = _to_mat([("cd", 2), ("ab", -2)])
+_QTR_MAT[..., 2, 0] = _to_mat([("bd", 2), ("ac", -2)])
+_QTR_MAT[..., 2, 1] = _to_mat([("cd", 2), ("ab", 2)])
+_QTR_MAT[..., 2, 2] = _to_mat([("aa", 1), ("bb", -1), ("cc", -1), ("dd", 1)])
+
+
+def quat_to_rot(quat: torch.Tensor) -> torch.Tensor:
+    """
+    Converts a quaternion to a rotation matrix.
+
+    Args:
+        quat: [*, 4] quaternions
+    Returns:
+        [*, 3, 3] rotation matrices
+    """
+    # [*, 4, 4]
+    quat = quat[..., None] * quat[..., None, :]
+
+    # [4, 4, 3, 3]
+    mat = _get_quat("_QTR_MAT", dtype=quat.dtype, device=quat.device)
+
+    # [*, 4, 4, 3, 3]
+    shaped_qtr_mat = mat.view((1,) * len(quat.shape[:-2]) + mat.shape)
+    quat = quat[..., None, None] * shaped_qtr_mat
+
+    # [*, 3, 3]
+    return torch.sum(quat, dim=(-3, -4))
+
+
+def rot_to_quat(rot: torch.Tensor) -> torch.Tensor:
+    if rot.shape[-2:] != (3, 3):
+        raise ValueError("Input rotation is incorrectly shaped")
+
+    [[xx, xy, xz], [yx, yy, yz], [zx, zy, zz]] = [[rot[..., i, j] for j in range(3)] for i in range(3)]
+
+    k = [
+        [
+            xx + yy + zz,
+            zy - yz,
+            xz - zx,
+            yx - xy,
+        ],
+        [
+            zy - yz,
+            xx - yy - zz,
+            xy + yx,
+            xz + zx,
+        ],
+        [
+            xz - zx,
+            xy + yx,
+            yy - xx - zz,
+            yz + zy,
+        ],
+        [
+            yx - xy,
+            xz + zx,
+            yz + zy,
+            zz - xx - yy,
+        ],
+    ]
+
+    _, vectors = torch.linalg.eigh((1.0 / 3.0) * torch.stack([torch.stack(t, dim=-1) for t in k], dim=-2))
+    return vectors[..., -1]
+
+
+_QUAT_MULTIPLY = np.zeros((4, 4, 4))
+_QUAT_MULTIPLY[:, :, 0] = [[1, 0, 0, 0], [0, -1, 0, 0], [0, 0, -1, 0], [0, 0, 0, -1]]
+
+_QUAT_MULTIPLY[:, :, 1] = [[0, 1, 0, 0], [1, 0, 0, 0], [0, 0, 0, 1], [0, 0, -1, 0]]
+
+_QUAT_MULTIPLY[:, :, 2] = [[0, 0, 1, 0], [0, 0, 0, -1], [1, 0, 0, 0], [0, 1, 0, 0]]
+
+_QUAT_MULTIPLY[:, :, 3] = [[0, 0, 0, 1], [0, 0, 1, 0], [0, -1, 0, 0], [1, 0, 0, 0]]
+
+_QUAT_MULTIPLY_BY_VEC = _QUAT_MULTIPLY[:, 1:, :]
+
+_CACHED_QUATS: dict[str, np.ndarray] = {
+    "_QTR_MAT": _QTR_MAT,
+    "_QUAT_MULTIPLY": _QUAT_MULTIPLY,
+    "_QUAT_MULTIPLY_BY_VEC": _QUAT_MULTIPLY_BY_VEC,
+}
+
+
+@cache
+def _get_quat(quat_key: str, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+    return torch.tensor(_CACHED_QUATS[quat_key], dtype=dtype, device=device)
+
+
+def quat_multiply(quat1: torch.Tensor, quat2: torch.Tensor) -> torch.Tensor:
+    """Multiply a quaternion by another quaternion."""
+    mat = _get_quat("_QUAT_MULTIPLY", dtype=quat1.dtype, device=quat1.device)
+    reshaped_mat = mat.view((1,) * len(quat1.shape[:-1]) + mat.shape)
+    return torch.sum(reshaped_mat * quat1[..., :, None, None] * quat2[..., None, :, None], dim=(-3, -2))
+
+
+def quat_multiply_by_vec(quat: torch.Tensor, vec: torch.Tensor) -> torch.Tensor:
+    """Multiply a quaternion by a pure-vector quaternion."""
+    mat = _get_quat("_QUAT_MULTIPLY_BY_VEC", dtype=quat.dtype, device=quat.device)
+    reshaped_mat = mat.view((1,) * len(quat.shape[:-1]) + mat.shape)
+    return torch.sum(reshaped_mat * quat[..., :, None, None] * vec[..., None, :, None], dim=(-3, -2))
+
+
+def invert_rot_mat(rot_mat: torch.Tensor) -> torch.Tensor:
+    return rot_mat.transpose(-1, -2)
+
+
+def invert_quat(quat: torch.Tensor) -> torch.Tensor:
+    quat_prime = quat.clone()
+    quat_prime[..., 1:] *= -1
+    inv = quat_prime / torch.sum(quat**2, dim=-1, keepdim=True)
+    return inv
+
+
+class Rotation:
+    """
+    A 3D rotation. Depending on how the object is initialized, the rotation is represented by either a rotation matrix
+    or a quaternion, though both formats are made available by helper functions. To simplify gradient computation, the
+    underlying format of the rotation cannot be changed in-place. Like Rigid, the class is designed to mimic the
+    behavior of a torch Tensor, almost as if each Rotation object were a tensor of rotations, in one format or another.
+    """
+
+    def __init__(
+        self,
+        rot_mats: torch.Tensor | None = None,
+        quats: torch.Tensor | None = None,
+        normalize_quats: bool = True,
+    ):
+        """
+        Args:
+            rot_mats:
+                A [*, 3, 3] rotation matrix tensor. Mutually exclusive with quats
+            quats:
+                A [*, 4] quaternion. Mutually exclusive with rot_mats. If normalize_quats is not True, must be a unit
+                quaternion
+            normalize_quats:
+                If quats is specified, whether to normalize quats
+        """
+        if (rot_mats is None and quats is None) or (rot_mats is not None and quats is not None):
+            raise ValueError("Exactly one input argument must be specified")
+
+        if (rot_mats is not None and rot_mats.shape[-2:] != (3, 3)) or (quats is not None and quats.shape[-1] != 4):
+            raise ValueError("Incorrectly shaped rotation matrix or quaternion")
+
+        # Force full-precision
+        if quats is not None:
+            quats = quats.to(dtype=torch.float32)
+        if rot_mats is not None:
+            rot_mats = rot_mats.to(dtype=torch.float32)
+
+        if quats is not None and normalize_quats:
+            quats = quats / torch.linalg.norm(quats, dim=-1, keepdim=True)
+
+        self._rot_mats = rot_mats
+        self._quats = quats
+
+    @staticmethod
+    def identity(
+        shape,
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+        requires_grad: bool = True,
+        fmt: str = "quat",
+    ) -> Rotation:
+        """
+        Returns an identity Rotation.
+
+        Args:
+            shape:
+                The "shape" of the resulting Rotation object. See documentation for the shape property
+            dtype:
+                The torch dtype for the rotation
+            device:
+                The torch device for the new rotation
+            requires_grad:
+                Whether the underlying tensors in the new rotation object should require gradient computation
+            fmt:
+                One of "quat" or "rot_mat". Determines the underlying format of the new object's rotation
+        Returns:
+            A new identity rotation
+        """
+        if fmt == "rot_mat":
+            rot_mats = identity_rot_mats(
+                shape,
+                dtype,
+                device,
+                requires_grad,
+            )
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif fmt == "quat":
+            quats = identity_quats(shape, dtype, device, requires_grad)
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            raise ValueError(f"Invalid format: f{fmt}")
+
+    # Magic methods
+
+    def __getitem__(self, index: Any) -> Rotation:
+        """
+        Allows torch-style indexing over the virtual shape of the rotation object. See documentation for the shape
+        property.
+
+        Args:
+            index:
+                A torch index. E.g. (1, 3, 2), or (slice(None,))
+        Returns:
+            The indexed rotation
+        """
+        if type(index) is not tuple:
+            index = (index,)
+
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats[index + (slice(None), slice(None))]
+            return Rotation(rot_mats=rot_mats)
+        elif self._quats is not None:
+            quats = self._quats[index + (slice(None),)]
+            return Rotation(quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def __mul__(self, right: torch.Tensor) -> Rotation:
+        """
+        Pointwise left multiplication of the rotation with a tensor. Can be used to e.g. mask the Rotation.
+
+        Args:
+            right:
+                The tensor multiplicand
+        Returns:
+            The product
+        """
+        if not (isinstance(right, torch.Tensor)):
+            raise TypeError("The other multiplicand must be a Tensor")
+
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats * right[..., None, None]
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif self._quats is not None:
+            quats = self._quats * right[..., None]
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def __rmul__(self, left: torch.Tensor) -> Rotation:
+        """
+        Reverse pointwise multiplication of the rotation with a tensor.
+
+        Args:
+            left:
+                The left multiplicand
+        Returns:
+            The product
+        """
+        return self.__mul__(left)
+
+    # Properties
+
+    @property
+    def shape(self) -> torch.Size:
+        """
+        Returns the virtual shape of the rotation object. This shape is defined as the batch dimensions of the
+        underlying rotation matrix or quaternion. If the Rotation was initialized with a [10, 3, 3] rotation matrix
+        tensor, for example, the resulting shape would be [10].
+
+        Returns:
+            The virtual shape of the rotation object
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.shape[:-2]
+        elif self._quats is not None:
+            return self._quats.shape[:-1]
+        else:
+            raise ValueError("Both rotations are None")
+
+    @property
+    def dtype(self) -> torch.dtype:
+        """
+        Returns the dtype of the underlying rotation.
+
+        Returns:
+            The dtype of the underlying rotation
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.dtype
+        elif self._quats is not None:
+            return self._quats.dtype
+        else:
+            raise ValueError("Both rotations are None")
+
+    @property
+    def device(self) -> torch.device:
+        """
+        The device of the underlying rotation
+
+        Returns:
+            The device of the underlying rotation
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.device
+        elif self._quats is not None:
+            return self._quats.device
+        else:
+            raise ValueError("Both rotations are None")
+
+    @property
+    def requires_grad(self) -> bool:
+        """
+        Returns the requires_grad property of the underlying rotation
+
+        Returns:
+            The requires_grad property of the underlying tensor
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats.requires_grad
+        elif self._quats is not None:
+            return self._quats.requires_grad
+        else:
+            raise ValueError("Both rotations are None")
+
+    def get_rot_mats(self) -> torch.Tensor:
+        """
+        Returns the underlying rotation as a rotation matrix tensor.
+
+        Returns:
+            The rotation as a rotation matrix tensor
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats
+        elif self._quats is not None:
+            return quat_to_rot(self._quats)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def get_quats(self) -> torch.Tensor:
+        """
+        Returns the underlying rotation as a quaternion tensor.
+
+        Depending on whether the Rotation was initialized with a quaternion, this function may call torch.linalg.eigh.
+
+        Returns:
+            The rotation as a quaternion tensor.
+        """
+        if self._rot_mats is not None:
+            return rot_to_quat(self._rot_mats)
+        elif self._quats is not None:
+            return self._quats
+        else:
+            raise ValueError("Both rotations are None")
+
+    def get_cur_rot(self) -> torch.Tensor:
+        """
+        Return the underlying rotation in its current form
+
+        Returns:
+            The stored rotation
+        """
+        if self._rot_mats is not None:
+            return self._rot_mats
+        elif self._quats is not None:
+            return self._quats
+        else:
+            raise ValueError("Both rotations are None")
+
+    # Rotation functions
+
+    def compose_q_update_vec(self, q_update_vec: torch.Tensor, normalize_quats: bool = True) -> Rotation:
+        """
+        Returns a new quaternion Rotation after updating the current object's underlying rotation with a quaternion
+        update, formatted as a [*, 3] tensor whose final three columns represent x, y, z such that (1, x, y, z) is the
+        desired (not necessarily unit) quaternion update.
+
+        Args:
+            q_update_vec:
+                A [*, 3] quaternion update tensor
+            normalize_quats:
+                Whether to normalize the output quaternion
+        Returns:
+            An updated Rotation
+        """
+        quats = self.get_quats()
+        new_quats = quats + quat_multiply_by_vec(quats, q_update_vec)
+        return Rotation(
+            rot_mats=None,
+            quats=new_quats,
+            normalize_quats=normalize_quats,
+        )
+
+    def compose_r(self, r: Rotation) -> Rotation:
+        """
+        Compose the rotation matrices of the current Rotation object with those of another.
+
+        Args:
+            r:
+                An update rotation object
+        Returns:
+            An updated rotation object
+        """
+        r1 = self.get_rot_mats()
+        r2 = r.get_rot_mats()
+        new_rot_mats = rot_matmul(r1, r2)
+        return Rotation(rot_mats=new_rot_mats, quats=None)
+
+    def compose_q(self, r: Rotation, normalize_quats: bool = True) -> Rotation:
+        """
+        Compose the quaternions of the current Rotation object with those of another.
+
+        Depending on whether either Rotation was initialized with quaternions, this function may call
+        torch.linalg.eigh.
+
+        Args:
+            r:
+                An update rotation object
+        Returns:
+            An updated rotation object
+        """
+        q1 = self.get_quats()
+        q2 = r.get_quats()
+        new_quats = quat_multiply(q1, q2)
+        return Rotation(rot_mats=None, quats=new_quats, normalize_quats=normalize_quats)
+
+    def apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        Apply the current Rotation as a rotation matrix to a set of 3D coordinates.
+
+        Args:
+            pts:
+                A [*, 3] set of points
+        Returns:
+            [*, 3] rotated points
+        """
+        rot_mats = self.get_rot_mats()
+        return rot_vec_mul(rot_mats, pts)
+
+    def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        The inverse of the apply() method.
+
+        Args:
+            pts:
+                A [*, 3] set of points
+        Returns:
+            [*, 3] inverse-rotated points
+        """
+        rot_mats = self.get_rot_mats()
+        inv_rot_mats = invert_rot_mat(rot_mats)
+        return rot_vec_mul(inv_rot_mats, pts)
+
+    def invert(self) -> Rotation:
+        """
+        Returns the inverse of the current Rotation.
+
+        Returns:
+            The inverse of the current Rotation
+        """
+        if self._rot_mats is not None:
+            return Rotation(rot_mats=invert_rot_mat(self._rot_mats), quats=None)
+        elif self._quats is not None:
+            return Rotation(
+                rot_mats=None,
+                quats=invert_quat(self._quats),
+                normalize_quats=False,
+            )
+        else:
+            raise ValueError("Both rotations are None")
+
+    # "Tensor" stuff
+
+    def unsqueeze(self, dim: int) -> Rotation:
+        """
+        Analogous to torch.unsqueeze. The dimension is relative to the shape of the Rotation object.
+
+        Args:
+            dim: A positive or negative dimension index.
+        Returns:
+            The unsqueezed Rotation.
+        """
+        if dim >= len(self.shape):
+            raise ValueError("Invalid dimension")
+
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats.unsqueeze(dim if dim >= 0 else dim - 2)
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif self._quats is not None:
+            quats = self._quats.unsqueeze(dim if dim >= 0 else dim - 1)
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    @staticmethod
+    def cat(rs: Sequence[Rotation], dim: int) -> Rotation:
+        """
+        Concatenates rotations along one of the batch dimensions. Analogous to torch.cat().
+
+        Note that the output of this operation is always a rotation matrix, regardless of the format of input
+        rotations.
+
+        Args:
+            rs:
+                A list of rotation objects
+            dim:
+                The dimension along which the rotations should be concatenated
+        Returns:
+            A concatenated Rotation object in rotation matrix format
+        """
+        rot_mats = torch.cat(
+            [r.get_rot_mats() for r in rs],
+            dim=dim if dim >= 0 else dim - 2,
+        )
+
+        return Rotation(rot_mats=rot_mats, quats=None)
+
+    def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rotation:
+        """
+        Apply a Tensor -> Tensor function to underlying rotation tensors, mapping over the rotation dimension(s). Can
+        be used e.g. to sum out a one-hot batch dimension.
+
+        Args:
+            fn:
+                A Tensor -> Tensor function to be mapped over the Rotation
+        Returns:
+            The transformed Rotation object
+        """
+        if self._rot_mats is not None:
+            rot_mats = self._rot_mats.view(self._rot_mats.shape[:-2] + (9,))
+            rot_mats = torch.stack(list(map(fn, torch.unbind(rot_mats, dim=-1))), dim=-1)
+            rot_mats = rot_mats.view(rot_mats.shape[:-1] + (3, 3))
+            return Rotation(rot_mats=rot_mats, quats=None)
+        elif self._quats is not None:
+            quats = torch.stack(list(map(fn, torch.unbind(self._quats, dim=-1))), dim=-1)
+            return Rotation(rot_mats=None, quats=quats, normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def cuda(self) -> Rotation:
+        """
+        Analogous to the cuda() method of torch Tensors
+
+        Returns:
+            A copy of the Rotation in CUDA memory
+        """
+        if self._rot_mats is not None:
+            return Rotation(rot_mats=self._rot_mats.cuda(), quats=None)
+        elif self._quats is not None:
+            return Rotation(rot_mats=None, quats=self._quats.cuda(), normalize_quats=False)
+        else:
+            raise ValueError("Both rotations are None")
+
+    def to(self, device: torch.device | None, dtype: torch.dtype | None) -> Rotation:
+        """
+        Analogous to the to() method of torch Tensors
+
+        Args:
+            device:
+                A torch device
+            dtype:
+                A torch dtype
+        Returns:
+            A copy of the Rotation using the new device and dtype
+        """
+        if self._rot_mats is not None:
+            return Rotation(
+                rot_mats=self._rot_mats.to(device=device, dtype=dtype),
+                quats=None,
+            )
+        elif self._quats is not None:
+            return Rotation(
+                rot_mats=None,
+                quats=self._quats.to(device=device, dtype=dtype),
+                normalize_quats=False,
+            )
+        else:
+            raise ValueError("Both rotations are None")
+
+    def detach(self) -> Rotation:
+        """
+        Returns a copy of the Rotation whose underlying Tensor has been detached from its torch graph.
+
+        Returns:
+            A copy of the Rotation whose underlying Tensor has been detached from its torch graph
+        """
+        if self._rot_mats is not None:
+            return Rotation(rot_mats=self._rot_mats.detach(), quats=None)
+        elif self._quats is not None:
+            return Rotation(
+                rot_mats=None,
+                quats=self._quats.detach(),
+                normalize_quats=False,
+            )
+        else:
+            raise ValueError("Both rotations are None")
+
+
+class Rigid:
+    """
+    A class representing a rigid transformation. Little more than a wrapper around two objects: a Rotation object and a
+    [*, 3] translation Designed to behave approximately like a single torch tensor with the shape of the shared batch
+    dimensions of its component parts.
+    """
+
+    def __init__(self, rots: Rotation | None, trans: torch.Tensor | None):
+        """
+        Args:
+            rots: A [*, 3, 3] rotation tensor
+            trans: A corresponding [*, 3] translation tensor
+        """
+        # (we need device, dtype, etc. from at least one input)
+
+        batch_dims, dtype, device, requires_grad = None, None, None, None
+        if trans is not None:
+            batch_dims = trans.shape[:-1]
+            dtype = trans.dtype
+            device = trans.device
+            requires_grad = trans.requires_grad
+        elif rots is not None:
+            batch_dims = rots.shape
+            dtype = rots.dtype
+            device = rots.device
+            requires_grad = rots.requires_grad
+        else:
+            raise ValueError("At least one input argument must be specified")
+
+        if rots is None:
+            rots = Rotation.identity(
+                batch_dims,
+                dtype,
+                device,
+                requires_grad,
+            )
+        elif trans is None:
+            trans = identity_trans(
+                batch_dims,
+                dtype,
+                device,
+                requires_grad,
+            )
+
+        assert rots is not None
+        assert trans is not None
+
+        if (rots.shape != trans.shape[:-1]) or (rots.device != trans.device):
+            raise ValueError("Rots and trans incompatible")
+
+        # Force full precision. Happens to the rotations automatically.
+        trans = trans.to(dtype=torch.float32)
+
+        self._rots = rots
+        self._trans = trans
+
+    @staticmethod
+    def identity(
+        shape: tuple[int, ...],
+        dtype: torch.dtype | None = None,
+        device: torch.device | None = None,
+        requires_grad: bool = True,
+        fmt: str = "quat",
+    ) -> Rigid:
+        """
+        Constructs an identity transformation.
+
+        Args:
+            shape:
+                The desired shape
+            dtype:
+                The dtype of both internal tensors
+            device:
+                The device of both internal tensors
+            requires_grad:
+                Whether grad should be enabled for the internal tensors
+        Returns:
+            The identity transformation
+        """
+        return Rigid(
+            Rotation.identity(shape, dtype, device, requires_grad, fmt=fmt),
+            identity_trans(shape, dtype, device, requires_grad),
+        )
+
+    def __getitem__(self, index: Any) -> Rigid:
+        """
+        Indexes the affine transformation with PyTorch-style indices. The index is applied to the shared dimensions of
+        both the rotation and the translation.
+
+        E.g.::
+
+            r = Rotation(rot_mats=torch.rand(10, 10, 3, 3), quats=None) t = Rigid(r, torch.rand(10, 10, 3)) indexed =
+            t[3, 4:6] assert(indexed.shape == (2,)) assert(indexed.get_rots().shape == (2,))
+            assert(indexed.get_trans().shape == (2, 3))
+
+        Args:
+            index: A standard torch tensor index. E.g. 8, (10, None, 3),
+            or (3, slice(0, 1, None))
+        Returns:
+            The indexed tensor
+        """
+        if type(index) is not tuple:
+            index = (index,)
+
+        return Rigid(
+            self._rots[index],
+            self._trans[index + (slice(None),)],
+        )
+
+    def __mul__(self, right: torch.Tensor) -> Rigid:
+        """
+        Pointwise left multiplication of the transformation with a tensor. Can be used to e.g. mask the Rigid.
+
+        Args:
+            right:
+                The tensor multiplicand
+        Returns:
+            The product
+        """
+        if not (isinstance(right, torch.Tensor)):
+            raise TypeError("The other multiplicand must be a Tensor")
+
+        new_rots = self._rots * right
+        new_trans = self._trans * right[..., None]
+
+        return Rigid(new_rots, new_trans)
+
+    def __rmul__(self, left: torch.Tensor) -> Rigid:
+        """
+        Reverse pointwise multiplication of the transformation with a tensor.
+
+        Args:
+            left:
+                The left multiplicand
+        Returns:
+            The product
+        """
+        return self.__mul__(left)
+
+    @property
+    def shape(self) -> torch.Size:
+        """
+        Returns the shape of the shared dimensions of the rotation and the translation.
+
+        Returns:
+            The shape of the transformation
+        """
+        return self._trans.shape[:-1]
+
+    @property
+    def device(self) -> torch.device:
+        """
+        Returns the device on which the Rigid's tensors are located.
+
+        Returns:
+            The device on which the Rigid's tensors are located
+        """
+        return self._trans.device
+
+    def get_rots(self) -> Rotation:
+        """
+        Getter for the rotation.
+
+        Returns:
+            The rotation object
+        """
+        return self._rots
+
+    def get_trans(self) -> torch.Tensor:
+        """
+        Getter for the translation.
+
+        Returns:
+            The stored translation
+        """
+        return self._trans
+
+    def compose_q_update_vec(self, q_update_vec: torch.Tensor) -> Rigid:
+        """
+        Composes the transformation with a quaternion update vector of shape [*, 6], where the final 6 columns
+        represent the x, y, and z values of a quaternion of form (1, x, y, z) followed by a 3D translation.
+
+        Args:
+            q_vec: The quaternion update vector.
+        Returns:
+            The composed transformation.
+        """
+        q_vec, t_vec = q_update_vec[..., :3], q_update_vec[..., 3:]
+        new_rots = self._rots.compose_q_update_vec(q_vec)
+
+        trans_update = self._rots.apply(t_vec)
+        new_translation = self._trans + trans_update
+
+        return Rigid(new_rots, new_translation)
+
+    def compose(self, r: Rigid) -> Rigid:
+        """
+        Composes the current rigid object with another.
+
+        Args:
+            r:
+                Another Rigid object
+        Returns:
+            The composition of the two transformations
+        """
+        new_rot = self._rots.compose_r(r._rots)
+        new_trans = self._rots.apply(r._trans) + self._trans
+        return Rigid(new_rot, new_trans)
+
+    def apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the transformation to a coordinate tensor.
+
+        Args:
+            pts: A [*, 3] coordinate tensor.
+        Returns:
+            The transformed points.
+        """
+        rotated = self._rots.apply(pts)
+        return rotated + self._trans
+
+    def invert_apply(self, pts: torch.Tensor) -> torch.Tensor:
+        """
+        Applies the inverse of the transformation to a coordinate tensor.
+
+        Args:
+            pts: A [*, 3] coordinate tensor
+        Returns:
+            The transformed points.
+        """
+        pts = pts - self._trans
+        return self._rots.invert_apply(pts)
+
+    def invert(self) -> Rigid:
+        """
+        Inverts the transformation.
+
+        Returns:
+            The inverse transformation.
+        """
+        rot_inv = self._rots.invert()
+        trn_inv = rot_inv.apply(self._trans)
+
+        return Rigid(rot_inv, -1 * trn_inv)
+
+    def map_tensor_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid:
+        """
+        Apply a Tensor -> Tensor function to underlying translation and rotation tensors, mapping over the
+        translation/rotation dimensions respectively.
+
+        Args:
+            fn:
+                A Tensor -> Tensor function to be mapped over the Rigid
+        Returns:
+            The transformed Rigid object
+        """
+        new_rots = self._rots.map_tensor_fn(fn)
+        new_trans = torch.stack(list(map(fn, torch.unbind(self._trans, dim=-1))), dim=-1)
+
+        return Rigid(new_rots, new_trans)
+
+    def to_tensor_4x4(self) -> torch.Tensor:
+        """
+        Converts a transformation to a homogeneous transformation tensor.
+
+        Returns:
+            A [*, 4, 4] homogeneous transformation tensor
+        """
+        tensor = self._trans.new_zeros((*self.shape, 4, 4))
+        tensor[..., :3, :3] = self._rots.get_rot_mats()
+        tensor[..., :3, 3] = self._trans
+        tensor[..., 3, 3] = 1
+        return tensor
+
+    @staticmethod
+    def from_tensor_4x4(t: torch.Tensor) -> Rigid:
+        """
+        Constructs a transformation from a homogeneous transformation tensor.
+
+        Args:
+            t: [*, 4, 4] homogeneous transformation tensor
+        Returns:
+            T object with shape [*]
+        """
+        if t.shape[-2:] != (4, 4):
+            raise ValueError("Incorrectly shaped input tensor")
+
+        rots = Rotation(rot_mats=t[..., :3, :3], quats=None)
+        trans = t[..., :3, 3]
+
+        return Rigid(rots, trans)
+
+    def to_tensor_7(self) -> torch.Tensor:
+        """
+        Converts a transformation to a tensor with 7 final columns, four for the quaternion followed by three for the
+        translation.
+
+        Returns:
+            A [*, 7] tensor representation of the transformation
+        """
+        tensor = self._trans.new_zeros((*self.shape, 7))
+        tensor[..., :4] = self._rots.get_quats()
+        tensor[..., 4:] = self._trans
+
+        return tensor
+
+    @staticmethod
+    def from_tensor_7(t: torch.Tensor, normalize_quats: bool = False) -> Rigid:
+        if t.shape[-1] != 7:
+            raise ValueError("Incorrectly shaped input tensor")
+
+        quats, trans = t[..., :4], t[..., 4:]
+
+        rots = Rotation(rot_mats=None, quats=quats, normalize_quats=normalize_quats)
+
+        return Rigid(rots, trans)
+
+    @staticmethod
+    def from_3_points(
+        p_neg_x_axis: torch.Tensor, origin: torch.Tensor, p_xy_plane: torch.Tensor, eps: float = 1e-8
+    ) -> Rigid:
+        """
+        Implements algorithm 21. Constructs transformations from sets of 3 points using the Gram-Schmidt algorithm.
+
+        Args:
+            p_neg_x_axis: [*, 3] coordinates
+            origin: [*, 3] coordinates used as frame origins
+            p_xy_plane: [*, 3] coordinates
+            eps: Small epsilon value
+        Returns:
+            A transformation object of shape [*]
+        """
+        p_neg_x_axis_unbound = torch.unbind(p_neg_x_axis, dim=-1)
+        origin_unbound = torch.unbind(origin, dim=-1)
+        p_xy_plane_unbound = torch.unbind(p_xy_plane, dim=-1)
+
+        e0 = [c1 - c2 for c1, c2 in zip(origin_unbound, p_neg_x_axis_unbound)]
+        e1 = [c1 - c2 for c1, c2 in zip(p_xy_plane_unbound, origin_unbound)]
+
+        denom = torch.sqrt(sum(c * c for c in e0) + eps * torch.ones_like(e0[0]))
+        e0 = [c / denom for c in e0]
+        dot = sum((c1 * c2 for c1, c2 in zip(e0, e1)))
+        e1 = [c2 - c1 * dot for c1, c2 in zip(e0, e1)]
+        denom = torch.sqrt(sum(c * c for c in e1) + eps * torch.ones_like(e1[0]))
+        e1 = [c / denom for c in e1]
+        e2 = [
+            e0[1] * e1[2] - e0[2] * e1[1],
+            e0[2] * e1[0] - e0[0] * e1[2],
+            e0[0] * e1[1] - e0[1] * e1[0],
+        ]
+
+        rots = torch.stack([c for tup in zip(e0, e1, e2) for c in tup], dim=-1)
+        rots = rots.reshape(rots.shape[:-1] + (3, 3))
+
+        rot_obj = Rotation(rot_mats=rots, quats=None)
+
+        return Rigid(rot_obj, torch.stack(origin_unbound, dim=-1))
+
+    def unsqueeze(self, dim: int) -> Rigid:
+        """
+        Analogous to torch.unsqueeze. The dimension is relative to the shared dimensions of the rotation/translation.
+
+        Args:
+            dim: A positive or negative dimension index.
+        Returns:
+            The unsqueezed transformation.
+        """
+        if dim >= len(self.shape):
+            raise ValueError("Invalid dimension")
+        rots = self._rots.unsqueeze(dim)
+        trans = self._trans.unsqueeze(dim if dim >= 0 else dim - 1)
+
+        return Rigid(rots, trans)
+
+    @staticmethod
+    def cat(ts: Sequence[Rigid], dim: int) -> Rigid:
+        """
+        Concatenates transformations along a new dimension.
+
+        Args:
+            ts:
+                A list of T objects
+            dim:
+                The dimension along which the transformations should be concatenated
+        Returns:
+            A concatenated transformation object
+        """
+        rots = Rotation.cat([t._rots for t in ts], dim)
+        trans = torch.cat([t._trans for t in ts], dim=dim if dim >= 0 else dim - 1)
+
+        return Rigid(rots, trans)
+
+    def apply_rot_fn(self, fn: Callable[[Rotation], Rotation]) -> Rigid:
+        """
+        Applies a Rotation -> Rotation function to the stored rotation object.
+
+        Args:
+            fn: A function of type Rotation -> Rotation
+        Returns:
+            A transformation object with a transformed rotation.
+        """
+        return Rigid(fn(self._rots), self._trans)
+
+    def apply_trans_fn(self, fn: Callable[[torch.Tensor], torch.Tensor]) -> Rigid:
+        """
+        Applies a Tensor -> Tensor function to the stored translation.
+
+        Args:
+            fn:
+                A function of type Tensor -> Tensor to be applied to the translation
+        Returns:
+            A transformation object with a transformed translation.
+        """
+        return Rigid(self._rots, fn(self._trans))
+
+    def scale_translation(self, trans_scale_factor: float) -> Rigid:
+        """
+        Scales the translation by a constant factor.
+
+        Args:
+            trans_scale_factor:
+                The constant factor
+        Returns:
+            A transformation object with a scaled translation.
+        """
+        return self.apply_trans_fn(lambda t: t * trans_scale_factor)
+
+    def stop_rot_gradient(self) -> Rigid:
+        """
+        Detaches the underlying rotation object
+
+        Returns:
+            A transformation object with detached rotations
+        """
+        return self.apply_rot_fn(lambda r: r.detach())
+
+    @staticmethod
+    def make_transform_from_reference(
+        n_xyz: torch.Tensor, ca_xyz: torch.Tensor, c_xyz: torch.Tensor, eps: float = 1e-20
+    ) -> Rigid:
+        """
+        Returns a transformation object from reference coordinates.
+
+        Note that this method does not take care of symmetries. If you provide the atom positions in the non-standard
+        way, the N atom will end up not at [-0.527250, 1.359329, 0.0] but instead at [-0.527250, -1.359329, 0.0]. You
+        need to take care of such cases in your code.
+
+        Args:
+            n_xyz: A [*, 3] tensor of nitrogen xyz coordinates.
+            ca_xyz: A [*, 3] tensor of carbon alpha xyz coordinates.
+            c_xyz: A [*, 3] tensor of carbon xyz coordinates.
+        Returns:
+            A transformation object. After applying the translation and rotation to the reference backbone, the
+            coordinates will approximately equal to the input coordinates.
+        """
+        translation = -1 * ca_xyz
+        n_xyz = n_xyz + translation
+        c_xyz = c_xyz + translation
+
+        c_x, c_y, c_z = [c_xyz[..., i] for i in range(3)]
+        norm = torch.sqrt(eps + c_x**2 + c_y**2)
+        sin_c1 = -c_y / norm
+        cos_c1 = c_x / norm
+
+        c1_rots = sin_c1.new_zeros((*sin_c1.shape, 3, 3))
+        c1_rots[..., 0, 0] = cos_c1
+        c1_rots[..., 0, 1] = -1 * sin_c1
+        c1_rots[..., 1, 0] = sin_c1
+        c1_rots[..., 1, 1] = cos_c1
+        c1_rots[..., 2, 2] = 1
+
+        norm = torch.sqrt(eps + c_x**2 + c_y**2 + c_z**2)
+        sin_c2 = c_z / norm
+        cos_c2 = torch.sqrt(c_x**2 + c_y**2) / norm
+
+        c2_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
+        c2_rots[..., 0, 0] = cos_c2
+        c2_rots[..., 0, 2] = sin_c2
+        c2_rots[..., 1, 1] = 1
+        c2_rots[..., 2, 0] = -1 * sin_c2
+        c2_rots[..., 2, 2] = cos_c2
+
+        c_rots = rot_matmul(c2_rots, c1_rots)
+        n_xyz = rot_vec_mul(c_rots, n_xyz)
+
+        _, n_y, n_z = [n_xyz[..., i] for i in range(3)]
+        norm = torch.sqrt(eps + n_y**2 + n_z**2)
+        sin_n = -n_z / norm
+        cos_n = n_y / norm
+
+        n_rots = sin_c2.new_zeros((*sin_c2.shape, 3, 3))
+        n_rots[..., 0, 0] = 1
+        n_rots[..., 1, 1] = cos_n
+        n_rots[..., 1, 2] = -1 * sin_n
+        n_rots[..., 2, 1] = sin_n
+        n_rots[..., 2, 2] = cos_n
+
+        rots = rot_matmul(n_rots, c_rots)
+
+        rots = rots.transpose(-1, -2)
+        translation = -1 * translation
+
+        rot_obj = Rotation(rot_mats=rots, quats=None)
+
+        return Rigid(rot_obj, translation)
+
+    def cuda(self) -> Rigid:
+        """
+        Moves the transformation object to GPU memory
+
+        Returns:
+            A version of the transformation on GPU
+        """
+        return Rigid(self._rots.cuda(), self._trans.cuda())
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..449c810aed3f51b34a96674f29d5dc8bdfc35103
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/esm/openfold_utils/tensor_utils.py
@@ -0,0 +1,140 @@
+# Copyright 2021 AlQuraishi Laboratory
+# Copyright 2021 DeepMind Technologies Limited
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#      http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from functools import partial
+from typing import Any, Callable, TypeVar, Union, overload
+
+import torch
+import torch.nn as nn
+import torch.types
+
+
+def add(m1: torch.Tensor, m2: torch.Tensor, inplace: bool) -> torch.Tensor:
+    # The first operation in a checkpoint can't be in-place, but it's
+    # nice to have in-place addition during inference. Thus...
+    if not inplace:
+        m1 = m1 + m2
+    else:
+        m1 += m2
+
+    return m1
+
+
+def permute_final_dims(tensor: torch.Tensor, inds: list[int]) -> torch.Tensor:
+    zero_index = -1 * len(inds)
+    first_inds = list(range(len(tensor.shape[:zero_index])))
+    return tensor.permute(first_inds + [zero_index + i for i in inds])
+
+
+def flatten_final_dims(t: torch.Tensor, no_dims: int) -> torch.Tensor:
+    return t.reshape(t.shape[:-no_dims] + (-1,))
+
+
+def masked_mean(mask: torch.Tensor, value: torch.Tensor, dim: int, eps: float = 1e-4) -> torch.Tensor:
+    mask = mask.expand(*value.shape)
+    return torch.sum(mask * value, dim=dim) / (eps + torch.sum(mask, dim=dim))
+
+
+def pts_to_distogram(
+    pts: torch.Tensor, min_bin: torch.types.Number = 2.3125, max_bin: torch.types.Number = 21.6875, no_bins: int = 64
+) -> torch.Tensor:
+    boundaries = torch.linspace(min_bin, max_bin, no_bins - 1, device=pts.device)
+    dists = torch.sqrt(torch.sum((pts.unsqueeze(-2) - pts.unsqueeze(-3)) ** 2, dim=-1))
+    return torch.bucketize(dists, boundaries)
+
+
+def dict_multimap(fn: Callable[[list], Any], dicts: list[dict]) -> dict:
+    first = dicts[0]
+    new_dict = {}
+    for k, v in first.items():
+        all_v = [d[k] for d in dicts]
+        if isinstance(v, dict):
+            new_dict[k] = dict_multimap(fn, all_v)
+        else:
+            new_dict[k] = fn(all_v)
+
+    return new_dict
+
+
+def one_hot(x: torch.Tensor, v_bins: torch.Tensor) -> torch.Tensor:
+    reshaped_bins = v_bins.view(((1,) * len(x.shape)) + (len(v_bins),))
+    diffs = x[..., None] - reshaped_bins
+    am = torch.argmin(torch.abs(diffs), dim=-1)
+    return nn.functional.one_hot(am, num_classes=len(v_bins)).float()
+
+
+def batched_gather(data: torch.Tensor, inds: torch.Tensor, dim: int = 0, no_batch_dims: int = 0) -> torch.Tensor:
+    ranges: list[Union[slice, torch.Tensor]] = []
+    for i, s in enumerate(data.shape[:no_batch_dims]):
+        r = torch.arange(s)
+        r = r.view(*(*((1,) * i), -1, *((1,) * (len(inds.shape) - i - 1))))
+        ranges.append(r)
+
+    remaining_dims: list[Union[slice, torch.Tensor]] = [slice(None) for _ in range(len(data.shape) - no_batch_dims)]
+    remaining_dims[dim - no_batch_dims if dim >= 0 else dim] = inds
+    ranges.extend(remaining_dims)
+    # Matt note: Editing this to get around the behaviour of using a list as an array index changing
+    # in recent Numpy versions
+    return data[tuple(ranges)]
+
+
+T = TypeVar("T")
+
+
+# With tree_map, a poor man's JAX tree_map
+def dict_map(
+    fn: Callable[[T], Any], dic: dict[Any, Union[dict, list, tuple, T]], leaf_type: type[T]
+) -> dict[Any, Union[dict, list, tuple, Any]]:
+    new_dict: dict[Any, Union[dict, list, tuple, Any]] = {}
+    for k, v in dic.items():
+        if isinstance(v, dict):
+            new_dict[k] = dict_map(fn, v, leaf_type)
+        else:
+            new_dict[k] = tree_map(fn, v, leaf_type)
+
+    return new_dict
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: T, leaf_type: type[T]) -> Any: ...
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: dict, leaf_type: type[T]) -> dict: ...
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: list, leaf_type: type[T]) -> list: ...
+
+
+@overload
+def tree_map(fn: Callable[[T], Any], tree: tuple, leaf_type: type[T]) -> tuple: ...
+
+
+def tree_map(fn, tree, leaf_type):
+    if isinstance(tree, dict):
+        return dict_map(fn, tree, leaf_type)
+    elif isinstance(tree, list):
+        return [tree_map(fn, x, leaf_type) for x in tree]
+    elif isinstance(tree, tuple):
+        return tuple(tree_map(fn, x, leaf_type) for x in tree)
+    elif isinstance(tree, leaf_type):
+        return fn(tree)
+    else:
+        print(type(tree))
+        raise TypeError("Not supported")
+
+
+tensor_tree_map = partial(tree_map, leaf_type=torch.Tensor)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36481e69b4378636f90d4ac0a6a54a2242af91ee
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a7f86a511573c58cf7751e6ad6eae741acee15a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/configuration_evolla.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9aef1d0dcfa605387a8a4287d2a751d3e551a0f2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modeling_evolla.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..203c1a64a3930f9c073dc36480d3ce9281070ca0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/evolla/__pycache__/modular_evolla.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e33a3612c733ddc6d9f513509cc8985db1901b0f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ef432bf5eadbeff7c8cfb2c9713accd363220a45
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/configuration_exaone4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f944ee5d63d20ac4f5e457dfb99188cb59531772
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modeling_exaone4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d79ecb912c7291e7594e5308b1014ce1c8942ed9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/exaone4/__pycache__/modular_exaone4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..47b57d1de5d8f87b76b60d03d017d8a5141c3f58
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b399d000efcb580c1bd878bd086c25fd55c2c2f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/configuration_falcon.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02162f757a9ecb1cbe3fb01e74431304fd06731b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon/__pycache__/modeling_falcon.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..93c752a2fa5afc645a9a9f00c6bd234812d67d77
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modeling_falcon_mamba.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baf1b2935e3fccd6e34a23012923c5c169ddb5cc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/falcon_mamba/__pycache__/modular_falcon_mamba.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b6497de5149931f8d9c302661cd6d9b2479d6897
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0782eceb1c67760538352daabb53331970a2913c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/configuration_fastspeech2_conformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a6a2a5deab4aa46a58b188977c9bf3335522e545
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/modeling_fastspeech2_conformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42f83bf9cf3b4328acce2e29089f6ab5c435f938
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fastspeech2_conformer/__pycache__/tokenization_fastspeech2_conformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a26070726d090a5f5914384f3626c8af6af39f06
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25ee890c7fd29f04b015394d50b9bf498ffc09bc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/configuration_flava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4b7a08eef7ab6fe8e3ee0a3474d469e64a33e531
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/feature_extraction_flava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3785c2af8b6373bb4ceff8bc0df139115c02fcf1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff7da20da4b31ac113a76fa6aaaa1a82034d9a72
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/image_processing_flava_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7b846639274a5ff466a1205aa4542bf2518d4a7c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flava/__pycache__/processing_flava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22de82d9e96490c1319fbe47d4ca3c4c4966849f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/configuration_flex_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/configuration_flex_olmo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b47044534f5fd840cffda9cd07e290dd4fa38ef
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/configuration_flex_olmo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modeling_flex_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modeling_flex_olmo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ce1a4813ef4f3d213fe202a97ff4e3640e3616b5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modeling_flex_olmo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modular_flex_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modular_flex_olmo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..036555ab83006f455e4a0314541469169ce813dc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/flex_olmo/__pycache__/modular_flex_olmo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5883d5bf73d4de6ffa708d022863df7b11714337
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..223e02718101368fdc175eb2663831fc6a481a5f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/configuration_focalnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b556738d5aeaa6e6b8ecbce7e47cbbb50e2f9835
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/focalnet/__pycache__/modeling_focalnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c2fbc8d389d00fb4a2a7e76bba92e31fa2e795e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25500da7548e0382094d3ad819abf014b051e76e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/configuration_fsmt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..631002676875ecc5a198e8499b34fc90f2245414
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/modeling_fsmt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6c467acac282db11fc7426e57d391243f0dff97f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fsmt/__pycache__/tokenization_fsmt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f719d800084d58c9fffa884697bc81e7beb95832
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e6b2aa9101677aa54c3f339265587d7e3440271b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/configuration_funnel.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03e3a38873cb4598c8d77ea8fc048e638614e19f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_funnel.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..197d4a0d95020b59ccd15681d570798a7184181e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/modeling_tf_funnel.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f604bac921a9fbd46f0e87347c6a04c29d72a32a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc0d442764ced5e684f5e8953ebe9402b08f21a0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/funnel/__pycache__/tokenization_funnel_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e9faacd3dfeb611c8ea8260a6a5409e667c32b1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..517e0f4754b09e805a51d8434ffe2c8a40451ed5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/configuration_fuyu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c0352fa0279d23a8643a9207b1ecec1cd364a2c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/image_processing_fuyu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..920dd703bb1d30fe013b1acbcf0690b68a2ca804
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/modeling_fuyu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..744cd073da9d6a6905b73f69d9bcc443961177fe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/fuyu/__pycache__/processing_fuyu.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9939bc40f83ca6006cdae305c005f1e26da8c26b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5dea69bfeb7b2106c300e1ced8e1609c889f620
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/configuration_gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d3abab8b48efcbebb6a71565a6fb2432ec6e1779
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_flax_gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3bde7a93e0551438b05a9eb6ffb6479976561324
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modeling_gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..959d895083cffdb7387168a09fc709ab0b8e2a4d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/modular_gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3f3a9cc68ce29a9a2ade8b78d355815f70fa6e98
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2b71042e7fbb21a76fe3cd955bd345bfa35b26f0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gemma/__pycache__/tokenization_gemma_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc037912c5c52b869099fd4c35ff1bd4ee0fd1c8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/git/modeling_git.py
@@ -0,0 +1,1460 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team.
+# All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch GIT model."""
+
+import math
+from dataclasses import dataclass
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPast,
+    BaseModelOutputWithPooling,
+    CausalLMOutputWithPast,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...pytorch_utils import apply_chunking_to_forward, find_pruneable_heads_and_indices, prune_linear_layer
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    can_return_tuple,
+    logging,
+    torch_int,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_git import GitConfig, GitVisionConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+    """
+)
+# Copied from transformers.models.clip.modeling_clip.CLIPVisionModelOutput with CLIP->Git
+class GitVisionModelOutput(ModelOutput):
+    r"""
+    image_embeds (`torch.FloatTensor` of shape `(batch_size, output_dim)` *optional* returned when model is initialized with `with_projection=True`):
+        The image embeddings obtained by applying the projection layer to the pooler_output.
+    """
+
+    image_embeds: Optional[torch.FloatTensor] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+class GitEmbeddings(nn.Module):
+    """Construct the embeddings from word and position embeddings."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.word_embeddings = nn.Embedding(config.vocab_size, config.hidden_size, padding_idx=config.pad_token_id)
+        self.position_embeddings = nn.Embedding(config.max_position_embeddings, config.hidden_size)
+
+        # self.LayerNorm is not snake-cased to stick with TensorFlow model variable name and be able to load
+        # any TensorFlow checkpoint file
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+        # position_ids (1, len position emb) is contiguous in memory and exported when serialized
+        self.position_embedding_type = getattr(config, "position_embedding_type", "absolute")
+        self.register_buffer(
+            "position_ids", torch.arange(config.max_position_embeddings).expand((1, -1)), persistent=False
+        )
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        past_key_values_length: int = 0,
+    ) -> torch.Tensor:
+        if input_ids is not None:
+            input_shape = input_ids.size()
+        else:
+            input_shape = inputs_embeds.size()[:-1]
+
+        seq_length = input_shape[1]
+
+        if position_ids is None:
+            position_ids = self.position_ids[:, past_key_values_length : seq_length + past_key_values_length]
+
+        if inputs_embeds is None:
+            embeddings = self.word_embeddings(input_ids)
+        else:
+            embeddings = inputs_embeds
+
+        if self.position_embedding_type == "absolute":
+            position_embeddings = self.position_embeddings(position_ids)
+            embeddings += position_embeddings
+        embeddings = self.LayerNorm(embeddings)
+        embeddings = self.dropout(embeddings)
+        return embeddings
+
+
+class GitSelfAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        if config.hidden_size % config.num_attention_heads != 0 and not hasattr(config, "embedding_size"):
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({config.num_attention_heads})"
+            )
+        self.layer_idx = layer_idx
+        if layer_idx is None:
+            logger.warning_once(
+                f"Instantiating {self.__class__.__name__} without passing a `layer_idx` is not recommended and will "
+                "lead to errors during the forward call if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.image_patch_tokens = int((config.vision_config.image_size / config.vision_config.patch_size) ** 2 + 1)
+        if config.num_image_with_embedding is not None:
+            self.image_patch_tokens *= config.num_image_with_embedding
+
+        self.query = nn.Linear(config.hidden_size, self.all_head_size)
+        self.key = nn.Linear(config.hidden_size, self.all_head_size)
+        self.value = nn.Linear(config.hidden_size, self.all_head_size)
+
+        self.dropout = nn.Dropout(config.attention_probs_dropout_prob)
+        self.position_embedding_type = position_embedding_type or getattr(
+            config, "position_embedding_type", "absolute"
+        )
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            self.max_position_embeddings = config.max_position_embeddings
+            self.distance_embedding = nn.Embedding(2 * config.max_position_embeddings - 1, self.attention_head_size)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        batch_size, seq_length, _ = hidden_states.shape
+        query_layer = (
+            self.query(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+
+        cutoff = self.image_patch_tokens if pixel_values_present else 0
+        key_layer = (
+            self.key(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        value_layer = (
+            self.value(hidden_states)
+            .view(batch_size, -1, self.num_attention_heads, self.attention_head_size)
+            .transpose(1, 2)
+        )
+        if past_key_values is not None:
+            # NOTE: like in other caches, we store the text component. In GIT it means we discard the image component.
+            key_layer_past, value_layer_past = past_key_values.update(
+                key_layer[:, :, cutoff:, :], value_layer[:, :, cutoff:, :], self.layer_idx
+            )
+            key_layer = torch.cat([key_layer[:, :, :cutoff, :], key_layer_past], dim=2)
+            value_layer = torch.cat([value_layer[:, :, :cutoff, :], value_layer_past], dim=2)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = torch.matmul(query_layer, key_layer.transpose(-1, -2))
+
+        if self.position_embedding_type == "relative_key" or self.position_embedding_type == "relative_key_query":
+            query_length, key_length = query_layer.shape[2], key_layer.shape[2]
+            if past_key_values is not None:
+                position_ids_l = torch.tensor(key_length - 1, dtype=torch.long, device=hidden_states.device).view(
+                    -1, 1
+                )
+            else:
+                position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_l - position_ids_r
+
+            positional_embedding = self.distance_embedding(distance + self.max_position_embeddings - 1)
+            positional_embedding = positional_embedding.to(dtype=query_layer.dtype)  # fp16 compatibility
+
+            if self.position_embedding_type == "relative_key":
+                relative_position_scores = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores
+            elif self.position_embedding_type == "relative_key_query":
+                relative_position_scores_query = torch.einsum("bhld,lrd->bhlr", query_layer, positional_embedding)
+                relative_position_scores_key = torch.einsum("bhrd,lrd->bhlr", key_layer, positional_embedding)
+                attention_scores = attention_scores + relative_position_scores_query + relative_position_scores_key
+
+        attention_scores = attention_scores / math.sqrt(self.attention_head_size)
+        if attention_mask is not None:
+            # Apply the attention mask is (precomputed for all layers in GitModel forward() function)
+            attention_scores = attention_scores + attention_mask
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = nn.functional.softmax(attention_scores, dim=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = attention_probs * head_mask
+
+        context_layer = torch.matmul(attention_probs, value_layer)
+
+        context_layer = context_layer.permute(0, 2, 1, 3).contiguous()
+        new_context_layer_shape = context_layer.size()[:-2] + (self.all_head_size,)
+        context_layer = context_layer.view(new_context_layer_shape)
+
+        return context_layer, attention_probs
+
+
+# Copied from transformers.models.bert.modeling_bert.BertSelfOutput
+class GitSelfOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+GIT_SELF_ATTENTION_CLASSES = {
+    "eager": GitSelfAttention,
+}
+
+
+class GitAttention(nn.Module):
+    def __init__(self, config, position_embedding_type=None, layer_idx=None):
+        super().__init__()
+        self.self = GIT_SELF_ATTENTION_CLASSES[config._attn_implementation](
+            config, position_embedding_type=position_embedding_type, layer_idx=layer_idx
+        )
+        self.output = GitSelfOutput(config)
+        self.pruned_heads = set()
+
+    # Copied from transformers.models.bert.modeling_bert.BertAttention.prune_heads
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(
+            heads, self.self.num_attention_heads, self.self.attention_head_size, self.pruned_heads
+        )
+
+        # Prune linear layers
+        self.self.query = prune_linear_layer(self.self.query, index)
+        self.self.key = prune_linear_layer(self.self.key, index)
+        self.self.value = prune_linear_layer(self.self.value, index)
+        self.output.dense = prune_linear_layer(self.output.dense, index, dim=1)
+
+        # Update hyper params and store pruned heads
+        self.self.num_attention_heads = self.self.num_attention_heads - len(heads)
+        self.self.all_head_size = self.self.attention_head_size * self.self.num_attention_heads
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        attn_output, self_attn_weights = self.self(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            past_key_values,
+            output_attentions,
+            pixel_values_present,
+        )
+        attention_output = self.output(attn_output, hidden_states)
+        return attention_output, self_attn_weights
+
+
+# Copied from transformers.models.bert.modeling_bert.BertIntermediate
+class GitIntermediate(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_bert.BertOutput
+class GitOutput(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout_prob)
+
+    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = self.LayerNorm(hidden_states + input_tensor)
+        return hidden_states
+
+
+class GitLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        self.chunk_size_feed_forward = config.chunk_size_feed_forward
+        self.seq_len_dim = 1
+        self.attention = GitAttention(config, layer_idx=layer_idx)
+        self.intermediate = GitIntermediate(config)
+        self.output = GitOutput(config)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+    ) -> tuple[torch.Tensor]:
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        attention_output, self_attention_weights = self.attention(
+            hidden_states,
+            attention_mask,
+            head_mask,
+            output_attentions=output_attentions,
+            past_key_values=past_key_values,
+            pixel_values_present=pixel_values_present,
+        )
+
+        layer_output = apply_chunking_to_forward(
+            self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output
+        )
+        return layer_output, self_attention_weights
+
+    def feed_forward_chunk(self, attention_output):
+        intermediate_output = self.intermediate(attention_output)
+        layer_output = self.output(intermediate_output, attention_output)
+        return layer_output
+
+
+class GitEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.layer = nn.ModuleList([GitLayer(config, i) for i in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        past_key_values: Optional[Union[Cache, tuple[tuple[torch.FloatTensor]]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = False,
+        output_hidden_states: Optional[bool] = False,
+        pixel_values_present: Optional[bool] = False,
+        return_dict: Optional[bool] = True,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPast]:
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+        for i, layer_module in enumerate(self.layer):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states,
+                attention_mask,
+                layer_head_mask,
+                past_key_values,
+                output_attentions,
+                pixel_values_present,
+            )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    past_key_values,
+                    all_hidden_states,
+                    all_self_attentions,
+                ]
+                if v is not None
+            )
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+@auto_docstring
+class GitPreTrainedModel(PreTrainedModel):
+    config: GitConfig
+    base_model_prefix = "git"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, GitVisionEmbeddings):
+            nn.init.normal_(module.class_embedding, mean=0.0, std=self.config.initializer_range)
+            nn.init.normal_(module.patch_embedding.weight, std=self.config.initializer_range)
+            nn.init.normal_(module.position_embedding.weight, std=self.config.initializer_range)
+        if isinstance(module, nn.Linear):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+# Copied from transformers.models.clip.modeling_clip.CLIPVisionEmbeddings with CLIP->Git
+class GitVisionEmbeddings(nn.Module):
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.image_size = config.image_size
+        self.patch_size = config.patch_size
+
+        self.class_embedding = nn.Parameter(torch.randn(self.embed_dim))
+
+        self.patch_embedding = nn.Conv2d(
+            in_channels=config.num_channels,
+            out_channels=self.embed_dim,
+            kernel_size=self.patch_size,
+            stride=self.patch_size,
+            bias=False,
+        )
+
+        self.num_patches = (self.image_size // self.patch_size) ** 2
+        self.num_positions = self.num_patches + 1
+        self.position_embedding = nn.Embedding(self.num_positions, self.embed_dim)
+        self.register_buffer("position_ids", torch.arange(self.num_positions).expand((1, -1)), persistent=False)
+
+    def interpolate_pos_encoding(self, embeddings: torch.Tensor, height: int, width: int) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1] - 1
+        position_embedding = self.position_embedding.weight.unsqueeze(0)
+        num_positions = position_embedding.shape[1] - 1
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return self.position_embedding(self.position_ids)
+
+        class_pos_embed = position_embedding[:, :1]
+        patch_pos_embed = position_embedding[:, 1:]
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_size
+        new_width = width // self.patch_size
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        patch_pos_embed = patch_pos_embed.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        patch_pos_embed = patch_pos_embed.permute(0, 3, 1, 2)
+
+        patch_pos_embed = nn.functional.interpolate(
+            patch_pos_embed,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        patch_pos_embed = patch_pos_embed.permute(0, 2, 3, 1).view(1, -1, dim)
+
+        return torch.cat((class_pos_embed, patch_pos_embed), dim=1)
+
+    def forward(self, pixel_values: torch.FloatTensor, interpolate_pos_encoding=False) -> torch.Tensor:
+        batch_size, _, height, width = pixel_values.shape
+        if not interpolate_pos_encoding and (height != self.image_size or width != self.image_size):
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size}*{self.image_size})."
+            )
+        target_dtype = self.patch_embedding.weight.dtype
+        patch_embeds = self.patch_embedding(pixel_values.to(dtype=target_dtype))  # shape = [*, width, grid, grid]
+        patch_embeds = patch_embeds.flatten(2).transpose(1, 2)
+
+        class_embeds = self.class_embedding.expand(batch_size, 1, -1)
+        embeddings = torch.cat([class_embeds, patch_embeds], dim=1)
+        if interpolate_pos_encoding:
+            embeddings = embeddings + self.interpolate_pos_encoding(embeddings, height, width)
+        else:
+            embeddings = embeddings + self.position_embedding(self.position_ids)
+        return embeddings
+
+
+class GitVisionMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(config.hidden_size, config.intermediate_size)
+        self.fc2 = nn.Linear(config.intermediate_size, config.hidden_size)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.siglip.modeling_siglip.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    attn_weights = torch.matmul(query, key.transpose(-1, -2)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class GitVisionAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+        self.scale = self.head_dim**-0.5
+        self.dropout = config.attention_dropout
+        self.is_causal = False
+
+        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
+        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, seq_length, embed_dim = hidden_states.shape
+
+        queries = self.q_proj(hidden_states)
+        keys = self.k_proj(hidden_states)
+        values = self.v_proj(hidden_states)
+
+        queries = queries.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        keys = keys.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        values = values.view(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        # CLIP text model uses both `causal_attention_mask` and `attention_mask`
+        # in case FA2 kernel is called, `is_causal` should be inferred from `causal_attention_mask`
+        if self.config._attn_implementation != "flash_attention_2":
+            if attention_mask is not None and causal_attention_mask is not None:
+                attention_mask = attention_mask + causal_attention_mask
+            elif causal_attention_mask is not None:
+                attention_mask = causal_attention_mask
+        else:
+            self.is_causal = causal_attention_mask is not None
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            queries,
+            keys,
+            values,
+            attention_mask,
+            is_causal=self.is_causal,
+            scaling=self.scale,
+            dropout=0.0 if not self.training else self.dropout,
+        )
+
+        attn_output = attn_output.reshape(batch_size, seq_length, embed_dim).contiguous()
+        attn_output = self.out_proj(attn_output)
+        if not output_attentions:
+            attn_weights = None
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoderLayer with AltCLIP->GitVision
+class GitVisionEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        self.embed_dim = config.hidden_size
+        self.self_attn = GitVisionAttention(config)
+        self.layer_norm1 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+        self.mlp = GitVisionMLP(config)
+        self.layer_norm2 = nn.LayerNorm(self.embed_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        causal_attention_mask: torch.Tensor,
+        output_attentions: Optional[bool] = False,
+    ) -> tuple[torch.FloatTensor]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+                `(config.encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        hidden_states = self.layer_norm1(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            causal_attention_mask=causal_attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.layer_norm2(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.altclip.modeling_altclip.AltCLIPEncoder with AltCLIP->GitVision, CLIPConfig
+class GitVisionEncoder(nn.Module):
+    """
+    Transformer encoder consisting of `config.num_hidden_layers` self attention layers. Each layer is a
+    [`GitVisionEncoderLayer`].
+
+    Args:
+        config: GitVisionConfig
+    """
+
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        self.config = config
+        self.layers = nn.ModuleList([GitVisionEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    @can_return_tuple
+    def forward(
+        self,
+        inputs_embeds,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            causal_attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Causal mask for the text model. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        hidden_states = inputs_embeds
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            layer_outputs = encoder_layer(
+                hidden_states,
+                attention_mask,
+                causal_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class GitVisionTransformer(nn.Module):
+    # Copied from transformers.models.altclip.modeling_altclip.AltCLIPVisionTransformer.__init__ with AltCLIPEncoder->GitVisionEncoder, AltCLIP->Git
+    def __init__(self, config: GitVisionConfig):
+        super().__init__()
+        self.config = config
+        embed_dim = config.hidden_size
+
+        self.embeddings = GitVisionEmbeddings(config)
+        self.pre_layrnorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+        self.encoder = GitVisionEncoder(config)
+        self.post_layernorm = nn.LayerNorm(embed_dim, eps=config.layer_norm_eps)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)
+        hidden_states = self.pre_layrnorm(hidden_states)
+
+        encoder_outputs = self.encoder(
+            inputs_embeds=hidden_states,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        last_hidden_state = encoder_outputs[0]
+
+        last_hidden_state = self.post_layernorm(last_hidden_state)
+
+        if not return_dict:
+            return (last_hidden_state,) + encoder_outputs[1:]
+
+        return BaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The vision model from CLIP, used in GIT, without any head or projection on top.
+    """
+)
+class GitVisionModel(GitPreTrainedModel):
+    config: GitVisionConfig
+    main_input_name = "pixel_values"
+
+    # Copied from transformers.models.clip.modeling_clip.CLIPVisionModel.__init__ with CLIP->Git
+    def __init__(self, config: GitVisionConfig):
+        super().__init__(config)
+        self.vision_model = GitVisionTransformer(config)
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.vision_model.embeddings.patch_embedding
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, GitVisionModel
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
+        >>> model = GitVisionModel.from_pretrained("microsoft/git-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        return self.vision_model(
+            pixel_values=pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+
+class GitProjection(nn.Module):
+    def __init__(self, config: GitConfig):
+        super().__init__()
+        self.config = config
+        self.visual_projection = nn.Sequential(
+            nn.Linear(config.vision_config.hidden_size, config.hidden_size),
+            nn.LayerNorm(config.hidden_size, eps=config.vision_config.layer_norm_eps),
+        )
+
+    def forward(self, embeddings: torch.Tensor) -> torch.Tensor:
+        return self.visual_projection(embeddings)
+
+
+@auto_docstring(
+    custom_intro="""
+    The bare GIT Model transformer consisting of a CLIP image encoder and text decoder outputting raw hidden-states
+    """
+)
+class GitModel(GitPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.config = config
+
+        self.embeddings = GitEmbeddings(config)
+        self.image_encoder = GitVisionModel(config.vision_config)
+        self.encoder = GitEncoder(config)
+
+        self.visual_projection = GitProjection(config)
+
+        if config.num_image_with_embedding is not None:
+            self.img_temporal_embedding = nn.ParameterList(
+                nn.Parameter(torch.zeros(1, 1, config.vision_config.hidden_size))
+                for _ in range(config.num_image_with_embedding)
+            )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.word_embeddings
+
+    def set_input_embeddings(self, value):
+        self.embeddings.word_embeddings = value
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    def _generate_future_mask(self, size: int, dtype: torch.dtype, device: torch.device) -> torch.Tensor:
+        # Default mask is for forward direction. Flip for backward direction.
+        mask = torch.triu(torch.ones(size, size, device=device, dtype=dtype), diagonal=1)
+        mask = mask.masked_fill(mask == 1, float("-inf"))
+        return mask
+
+    def create_attention_mask(self, tgt, memory, tgt_mask, past_key_values_length, memory_key_padding_mask=None):
+        num_tgt = tgt.shape[1]
+        num_memory = memory.shape[1]
+        device = tgt.device
+        dtype = tgt.dtype
+        top_left = torch.zeros((num_memory, num_memory), device=device, dtype=dtype)
+        top_right = torch.full(
+            (num_memory, num_tgt + past_key_values_length),
+            float("-inf"),
+            device=tgt.device,
+            dtype=dtype,
+        )
+        bottom_left = torch.zeros(
+            (num_tgt, num_memory),
+            dtype=dtype,
+            device=tgt_mask.device,
+        )
+
+        if past_key_values_length > 0:
+            tgt_mask = torch.zeros(
+                (tgt_mask.shape[0], tgt_mask.shape[0] + past_key_values_length),
+                dtype=dtype,
+                device=tgt_mask.device,
+            )
+
+        left = torch.cat((top_left, bottom_left), dim=0)
+        right = torch.cat((top_right, tgt_mask.to(dtype)), dim=0)
+
+        full_attention_mask = torch.cat((left, right), dim=1)[None, :]
+
+        if memory_key_padding_mask is None:
+            memory_key_padding_mask = torch.full((memory.shape[0], memory.shape[1]), fill_value=False, device=device)
+        # if it is False, it means valid. That is, it is not a padding
+        if memory_key_padding_mask.dtype != torch.bool:
+            raise ValueError("Memory key padding mask must be a boolean tensor.")
+        zero_negative_infinity = torch.zeros_like(memory_key_padding_mask, dtype=tgt.dtype)
+        zero_negative_infinity[memory_key_padding_mask] = float("-inf")
+        full_attention_mask = full_attention_mask.expand(
+            (memory_key_padding_mask.shape[0], num_memory + num_tgt, num_memory + past_key_values_length + num_tgt)
+        )
+        full_attention_mask = full_attention_mask.clone()
+        origin_left = full_attention_mask[:, :, :num_memory]
+        update = zero_negative_infinity[:, None, :]
+        full_attention_mask[:, :, :num_memory] = origin_left + update
+
+        # add axis for multi-head
+        full_attention_mask = full_attention_mask[:, None, :, :]
+
+        return full_attention_mask
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], BaseModelOutputWithPooling]:
+        r"""
+        Examples:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModel
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base")
+        >>> model = AutoModel.from_pretrained("microsoft/git-base")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> text = "this is an image of two cats"
+
+        >>> inputs = processor(images=image, text=text, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> last_hidden_state = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        seq_length = input_shape[1]
+
+        # past_key_values_length
+        past_key_values_length = 0
+        if past_key_values is not None:
+            past_key_values_length = (
+                past_key_values.get_seq_length()
+                if not isinstance(past_key_values, Cache)
+                else past_key_values.get_seq_length()
+            )
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, self.config.num_hidden_layers)
+
+        projected_visual_features = None
+        if pixel_values is not None:
+            if pixel_values.ndim == 4:
+                # here we assume pixel_values is of shape (batch_size, num_channels, height, width)
+                visual_features = self.image_encoder(
+                    pixel_values, interpolate_pos_encoding=interpolate_pos_encoding
+                ).last_hidden_state
+
+            elif pixel_values.ndim == 5:
+                # here we assume pixel_values is of shape (batch_size, num_frames, num_channels, height, width)
+                visual_features = []
+                for frame_idx in range(pixel_values.shape[1]):
+                    visual_features_frame = self.image_encoder(
+                        pixel_values[:, frame_idx, :, :], interpolate_pos_encoding=interpolate_pos_encoding
+                    ).last_hidden_state
+                    visual_features_frame += self.img_temporal_embedding[frame_idx]
+                    visual_features.append(visual_features_frame)
+
+                # finally, concatenate all features along sequence dimension
+                visual_features = torch.cat(visual_features, dim=1)
+
+            else:
+                raise ValueError("pixel_values must be of rank 4 or 5")
+
+            projected_visual_features = self.visual_projection(visual_features)
+
+        embedding_output = self.embeddings(
+            input_ids=input_ids,
+            position_ids=position_ids,
+            inputs_embeds=inputs_embeds,
+            past_key_values_length=past_key_values_length,
+        )
+
+        if projected_visual_features is None:
+            projected_visual_features = torch.zeros(
+                (embedding_output.shape[0], 0, embedding_output.shape[2]),
+                dtype=embedding_output.dtype,
+                device=embedding_output.device,
+            )
+
+        # Repeat visual features to match embedding batch size.
+        projected_visual_features = projected_visual_features.repeat(
+            embedding_output.size(0) // projected_visual_features.size(0), 1, 1
+        )
+
+        # concatenate patch token and text token embeddings
+        hidden_states = torch.cat((projected_visual_features, embedding_output), dim=1)
+
+        # By default, an additive causal mask is created
+        # for masking the future (one direction).
+        tgt_mask = self._generate_future_mask(seq_length, embedding_output.dtype, embedding_output.device)
+
+        # Create an attention mask of shape (batch_size, 1, tgt_seq_len, src_seq_len)
+        combined_attention_mask = self.create_attention_mask(
+            tgt=embedding_output,
+            memory=projected_visual_features,
+            tgt_mask=tgt_mask,
+            past_key_values_length=past_key_values_length,
+        )
+
+        if attention_mask is not None:
+            # if the user provides an attention mask, we add it to the default one
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            expanded_attn_mask = _prepare_4d_attention_mask(
+                attention_mask, embedding_output.dtype, tgt_len=input_shape[-1]
+            ).to(embedding_output.device)
+            if past_key_values_length > 0:
+                expanded_attn_mask = expanded_attn_mask[:, :, -past_key_values_length:, :]
+            else:
+                combined_attention_mask[:, :, -input_shape[1] :, -input_shape[1] :] += expanded_attn_mask
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=combined_attention_mask,
+            head_mask=head_mask,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            pixel_values_present=pixel_values is not None,
+        )
+        sequence_output = encoder_outputs[0]
+
+        if not return_dict:
+            return (sequence_output,) + encoder_outputs[1:]
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=sequence_output,
+            past_key_values=encoder_outputs.past_key_values,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    GIT Model with a `language modeling` head on top for autoregressive language modeling.
+    """
+)
+class GitForCausalLM(GitPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["output.weight"]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.git = GitModel(config)
+        self.output = nn.Linear(config.hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_output_embeddings(self):
+        return self.output
+
+    def set_output_embeddings(self, new_embeddings):
+        self.output = new_embeddings
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        pixel_values: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.Tensor]]] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: bool = False,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.Tensor], CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the left-to-right language modeling loss (next word prediction). Indices should be in
+            `[-100, 0, ..., config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are
+            ignored (masked), the loss is only computed for the tokens with labels n `[0, ..., config.vocab_size]`
+
+        Examples:
+
+        Image captioning example:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModelForCausalLM
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-coco")
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-coco")
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values
+
+        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
+        >>> generated_caption = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
+        >>> print(generated_caption)
+        two cats sleeping on a pink blanket next to remotes.
+        ```
+
+        Visual question answering (VQA) example:
+
+        ```python
+        >>> from transformers import AutoProcessor, AutoModelForCausalLM
+        >>> from huggingface_hub import hf_hub_download
+        >>> from PIL import Image
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-textvqa")
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-textvqa")
+
+        >>> file_path = hf_hub_download(repo_id="nielsr/textvqa-sample", filename="bus.png", repo_type="dataset")
+        >>> image = Image.open(file_path).convert("RGB")
+
+        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values
+
+        >>> question = "what does the front of the bus say at the top?"
+
+        >>> input_ids = processor(text=question, add_special_tokens=False).input_ids
+        >>> input_ids = [processor.tokenizer.cls_token_id] + input_ids
+        >>> input_ids = torch.tensor(input_ids).unsqueeze(0)
+
+        >>> generated_ids = model.generate(pixel_values=pixel_values, input_ids=input_ids, max_length=50)
+        >>> print(processor.batch_decode(generated_ids, skip_special_tokens=True))
+        ['what does the front of the bus say at the top? special']
+        ```
+
+        Video captioning example:
+
+        ```python
+        >>> import av
+        >>> import numpy as np
+        >>> from PIL import Image
+        >>> from huggingface_hub import hf_hub_download
+        >>> from transformers import AutoProcessor, AutoModelForCausalLM
+
+        >>> processor = AutoProcessor.from_pretrained("microsoft/git-base-vatex")
+        >>> model = AutoModelForCausalLM.from_pretrained("microsoft/git-base-vatex")
+
+        >>> # set seed for reproducibility
+        >>> np.random.seed(45)
+
+
+        >>> def read_video_pyav(container, indices):
+        ...     '''
+        ...     Decode the video with PyAV decoder.
+        ...     Args:
+        ...         container (`av.container.input.InputContainer`): PyAV container.
+        ...         indices (`list[int]`): List of frame indices to decode.
+        ...     Returns:
+        ...         result (np.ndarray): np array of decoded frames of shape (num_frames, height, width, 3).
+        ...     '''
+        ...     frames = []
+        ...     container.seek(0)
+        ...     start_index = indices[0]
+        ...     end_index = indices[-1]
+        ...     for i, frame in enumerate(container.decode(video=0)):
+        ...         if i > end_index:
+        ...             break
+        ...         if i >= start_index and i in indices:
+        ...             frames.append(frame)
+        ...     return np.stack([x.to_ndarray(format="rgb24") for x in frames])
+
+
+        >>> def sample_frame_indices(clip_len, frame_sample_rate, seg_len):
+        ...     '''
+        ...     Sample a given number of frame indices from the video.
+        ...     Args:
+        ...         clip_len (`int`): Total number of frames to sample.
+        ...         frame_sample_rate (`int`): Sample every n-th frame.
+        ...         seg_len (`int`): Maximum allowed index of sample's last frame.
+        ...     Returns:
+        ...         indices (`list[int]`): List of sampled frame indices
+        ...     '''
+        ...     converted_len = int(clip_len * frame_sample_rate)
+        ...     end_idx = np.random.randint(converted_len, seg_len)
+        ...     start_idx = end_idx - converted_len
+        ...     indices = np.linspace(start_idx, end_idx, num=clip_len)
+        ...     indices = np.clip(indices, start_idx, end_idx - 1).astype(np.int64)
+        ...     return indices
+
+
+        >>> # load video
+        >>> file_path = hf_hub_download(
+        ...     repo_id="nielsr/video-demo", filename="eating_spaghetti.mp4", repo_type="dataset"
+        ... )
+        >>> container = av.open(file_path)
+
+        >>> # sample frames
+        >>> num_frames = model.config.num_image_with_embedding
+        >>> indices = sample_frame_indices(
+        ...     clip_len=num_frames, frame_sample_rate=4, seg_len=container.streams.video[0].frames
+        ... )
+        >>> frames = read_video_pyav(container, indices)
+
+        >>> pixel_values = processor(images=list(frames), return_tensors="pt").pixel_values
+
+        >>> generated_ids = model.generate(pixel_values=pixel_values, max_length=50)
+
+        >>> print("Generated caption:", processor.batch_decode(generated_ids, skip_special_tokens=True))
+        Generated caption: ['a woman is sitting at a table and she is talking about the food she is holding.']
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        if labels is not None:
+            use_cache = False
+
+        outputs = self.git(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            pixel_values=pixel_values,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+        logits = self.output(sequence_output)
+
+        loss = None
+        if labels is not None:
+            # we are doing next-token prediction; shift prediction scores and input ids by one
+            num_image_tokens = self.git.encoder.layer[0].attention.self.image_patch_tokens
+            shifted_logits = logits[:, num_image_tokens:-1, :].contiguous()
+            labels = labels[:, 1:].contiguous()
+            loss = self.loss_function(
+                shifted_logits.view(-1, self.config.vocab_size),
+                labels.view(-1),
+                vocab_size=self.config.vocab_size,
+                **kwargs,
+            )
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self, input_ids, past_key_values=None, attention_mask=None, use_cache=None, **kwargs
+    ):
+        # Overwritten -- `git` has special cache handling and doesn't support generating from `inputs_embeds` atm
+
+        # cut decoder_input_ids if past_key_values is used
+        if past_key_values is not None:
+            past_length = past_key_values.get_seq_length()
+
+            # Some generation methods already pass only the last input ID
+            if input_ids.shape[1] > past_length:
+                remove_prefix_length = past_length
+            else:
+                # Default to old behavior: keep only final ID
+                remove_prefix_length = input_ids.shape[1] - 1
+
+            input_ids = input_ids[:, remove_prefix_length:]
+
+        # if model is used as a decoder in encoder-decoder model, the decoder attention mask is created on the fly
+        input_shape = input_ids.shape
+        if attention_mask is None:
+            attention_mask = input_ids.new_ones(input_shape)
+
+        model_inputs = {
+            "input_ids": input_ids,
+            "attention_mask": attention_mask,
+            "pixel_values": kwargs.get("pixel_values"),
+            "past_key_values": past_key_values,
+            "use_cache": use_cache,
+        }
+
+        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+
+        return model_inputs
+
+
+__all__ = ["GitForCausalLM", "GitModel", "GitPreTrainedModel", "GitVisionModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1a73513e3caacdd62de5327c2ee1022fefce378
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e91b45da2235a6d1d3b99d08d537b7f06390d7a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/configuration_glm4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b2fb09f675c2fa217e5f8fda966cc679dbc47157
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modeling_glm4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03b9d90dc91cb9d50d92187af3d60221328e7315
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4/__pycache__/modular_glm4.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ab15356ef75e8267fad42da3edda58cd9fd8e8f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9c9b91b2c416af487b83521997e60a8ad2ffa8bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/configuration_glm4v.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..59e6802be823cb67816897a7c39317a395a0f537
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a898ef5d8b92b1d09bcc80cdb71266a881a7bda0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/image_processing_glm4v_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..27ce6575bc2f40110c904e1200edf9d67c31361f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modeling_glm4v.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5a088d10f5c76828fccb5be3957a5ac8e83067a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/modular_glm4v.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c509afc040ae9dcff9012873f7f57ac0f4170fb2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/processing_glm4v.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dadaff59bcd156745e612f4f093ea2cc5a689219
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/glm4v/__pycache__/video_processing_glm4v.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0757b63702b9e9bdb3ba967ddecbd26ca990528b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..468020f65f0b839c7a27be874901259ddb9c3b3e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/configuration_got_ocr2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6bd4bfa01b6cb3ba130123e877adedb6ef6dbcc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0a2e9104b900684e14042fa79d480a86df4989f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/image_processing_got_ocr2_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..474baba5fd562256e91d30a59377114ebe723cad
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modeling_got_ocr2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36fc1e9e757757c6d0198d56dbb9c4976ac09662
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/modular_got_ocr2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..379c6bf23f28c0227a45f2f2ebebf958cecba335
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/got_ocr2/__pycache__/processing_got_ocr2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49c9146c62e39e67c83dce58894a277c9eec9a21
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ece9e529f844915862650cec50fe99b434c46e14
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/configuration_gpt2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c122aae93d03319b7c38a67ee2ca35656f69392
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_flax_gpt2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26057ea532231512299b710df7379fef62e794fb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_gpt2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ff132753812b189d57a46cf0541c3becaa0b3d5b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/modeling_tf_gpt2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ab28233ccaaac9297d57a162c06dd2f29a5b2365
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c294ab8fcf00daf2202437891a493853a1f71845
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..74af609b3aa07215b27a4960aff70b2e0602b9bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt2/__pycache__/tokenization_gpt2_tf.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d925ae82b0b75e9b47f28c3c502cd85afa9e75e8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ad918657f9bd51a1945a36aa4417692f1996dfc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/configuration_gpt_neox.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1c67d9e3b6d0a6da75b0da0975ae330883b1a268
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/modular_gpt_neox.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..97974b26e32409c419e8e47fa178497ed8f1fa52
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox/__pycache__/tokenization_gpt_neox_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..374b61f801c73905ae7f0b70755794867e51763a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ac02f97f9fed5f0ee72832b5648b15c42292bbbb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/configuration_gpt_neox_japanese.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..218edcb605020381f01b082cb57bb4bc84d9b0c0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/modeling_gpt_neox_japanese.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92ff2664a9cb494d3ef26155a79cb7e78b4c3f0d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gpt_neox_japanese/__pycache__/tokenization_gpt_neox_japanese.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..956087c1beaecbf1754ca8c504e4cd7bd213e09d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..238602497876cd50f479a45f879abfdf515f0f8c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/configuration_gptj.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..193222f28b7294887f17304fa61ec4d3072e02a0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_flax_gptj.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..066768e78fe56b85d87b6bd612985978d4cf0be0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_gptj.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..834f27da1e63e2e95a5fa5585fbae5b6b580a453
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/gptj/__pycache__/modeling_tf_gptj.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..49fb0aef84fb229e8a7ddb275bf16b4d5a24350e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1850bf4c176228c4f6cd8c803f48cc54d2f225dd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/configuration_granite.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94967f0a8373a352f5a6757ef1820bf8d669e90a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modeling_granite.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4655cf52a4a13d93309e96be0a96b5a4b59780ec
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granite/__pycache__/modular_granite.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f1df1f4dcb48c5a540d1fa92902c4495d3aacaad
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28dec61f0dbeff4be018ddf61bc2857049f698ff
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/configuration_granitemoehybrid.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b13df60442756fd7b84ddd7d67593cb2bf6f5beb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modeling_granitemoehybrid.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bbf563c4964af7dea71ccc8956dd880ca34baf2e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoehybrid/__pycache__/modular_granitemoehybrid.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5583717fdf5480c86324556a2a2fbf58b17660c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..961ed6016cc622306dca93770e721c55ef58f552
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/configuration_granitemoeshared.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fa55a9b0d08d21e12cb74c4f17935a2e77885773
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modeling_granitemoeshared.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65559a56f6df09ac2c5bfb32bdcef9dddcf97594
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/granitemoeshared/__pycache__/modular_granitemoeshared.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4f6c83fbe9199fc0f60a995e224e8b50e669332d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c265911b80a34d9084681cc03d765b386716dfed
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/configuration_grounding_dino.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..57f2a27cd65741f53538054b741466129714556f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a679e9f918a7271b312cdaed329690b5cfe985cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/image_processing_grounding_dino_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..106dc5cd925103f710cbaba498f21a54b264fa98
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/modular_grounding_dino.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6b3e6c5696c90dc6c5202280ea6cbabb486a8e5c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/grounding_dino/__pycache__/processing_grounding_dino.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6df34aa43f456f52d231db63025dab2212410c8a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92f597d99e76d686e7bb69f8c3faa28c090b4d93
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/configuration_hgnet_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e40037c788a4f0e380367ad9b3a85aa70de4e16f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modeling_hgnet_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0db23e82b2d7847d2614b2443ca5e58ec614cc86
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hgnet_v2/__pycache__/modular_hgnet_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..841f13be4c0d2f48f54eecc916acd826395449af
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_hiera import *
+    from .modeling_hiera import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/configuration_hiera.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/configuration_hiera.py
new file mode 100644
index 0000000000000000000000000000000000000000..2342d7e562a50de0c0937040a8e8279c7860e931
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/configuration_hiera.py
@@ -0,0 +1,194 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Hiera model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class HieraConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HieraModel`]. It is used to instantiate a Hiera
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the Hiera
+    [facebook/hiera-base-224](https://huggingface.co/facebook/hiera-base-224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        embed_dim (`int`, *optional*, defaults to 96):
+            Dimensionality of patch embedding.
+        image_size (`list(int)`, *optional*, defaults to `[224, 224]`):
+            The size (resolution) of input in the format (height, width) for images
+            and (frames, height, width) for videos.
+        patch_size (`list(int)`, *optional*, defaults to `[7, 7]`):
+            The size (resolution) of each patch.
+        patch_stride (`list(int)`, *optional*, defaults to `[4, 4]`):
+            The stride of the patch.
+        patch_padding (`list(int)`, *optional*, defaults to `[3, 3]`):
+            The padding of the patch.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            The ratio of mlp hidden dim to embedding dim.
+        depths (`list(int)`, *optional*, defaults to `[2, 3, 16, 3]`):
+            Depth of each layer in the Transformer encoder.
+        num_heads (`list(int)`, *optional*, defaults to `[1, 2, 4, 8]`):
+            Number of attention heads in each layer of the Transformer encoder.
+        embed_dim_multiplier (`float`, *optional*, defaults to 2.0):
+            The multiplier to the dimensionality of patch embedding in each layer of the Transformer encoder.
+        num_query_pool (`int`, *optional*, defaults to 3):
+            The number of query pool stages.
+        query_stride (`list(int)`, *optional*, defaults to `[2, 2]`):
+            The stride of the query pool.
+        masked_unit_size (`list(int)`, *optional*, defaults to `[8, 8]`):
+            The size of the masked unit.
+        masked_unit_attention (`list(bool)`, *optional*, defaults to `[True, True, False, False]`):
+            Whether to use masked unit attention in each layer of the Transformer encoder.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            The drop path rate.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder. If string, `"gelu"`, `"relu"`,
+            `"selu"` and `"gelu_new"` are supported.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices and
+            the zero_initializer for initializing all bias vectors.
+        layer_norm_init (`float`, *optional*, defaults to 1.0):
+            The initial weight value for layer normalization layers.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-06):
+            The epsilon used by the layer normalization layers.
+        decoder_hidden_size (`int`, *optional*):
+            Dimensionality of decoder embeddings for MAE pretraining.
+        decoder_depth (`int`, *optional*):
+            Depth of the decoder for MAE pretraining.
+        decoder_num_heads (`int`, *optional*):
+            Number of attention heads in each layer of the decoder for MAE pretraining.
+        normalize_pixel_loss (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the pixel loss by the number of pixels.
+        mask_ratio (`float`, *optional*, defaults to 0.6):
+            The ratio of masked tokens in the input.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+
+    Example:
+
+    ```python
+    >>> from transformers import HieraConfig, HieraModel
+
+    >>> # Initializing a Hiera hiera-base-patch16-224 style configuration
+    >>> configuration = HieraConfig()
+
+    >>> # Initializing a model (with random weights) from the hiera-base-patch16-224 style configuration
+    >>> model = HieraModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "hiera"
+
+    attribute_map = {"num_hidden_layers": "num_layers"}
+
+    def __init__(
+        self,
+        embed_dim=96,
+        image_size=[224, 224],
+        patch_size=[7, 7],
+        patch_stride=[4, 4],
+        patch_padding=[3, 3],
+        mlp_ratio=4.0,
+        depths=[2, 3, 16, 3],
+        num_heads=[1, 2, 4, 8],
+        embed_dim_multiplier=2.0,
+        num_query_pool=3,
+        query_stride=[2, 2],
+        masked_unit_size=[8, 8],
+        masked_unit_attention=[True, True, False, False],
+        drop_path_rate=0.0,
+        num_channels=3,
+        hidden_act="gelu",
+        initializer_range=0.02,
+        layer_norm_init=1.0,
+        layer_norm_eps=1e-6,
+        decoder_hidden_size=None,
+        decoder_depth=None,
+        decoder_num_heads=None,
+        normalize_pixel_loss=True,
+        mask_ratio=0.6,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if masked_unit_size[0] % query_stride[0] ** (len(depths) - 1) != 0:
+            raise ValueError(
+                f"masked_unit_size[0] ({masked_unit_size[0]}) must be divisible by query_stride[0] ({query_stride[0]}) "
+                f"raised to the power of the number of layers ({len(depths) - 1})"
+            )
+
+        if num_query_pool >= len(depths):
+            raise ValueError(
+                f"num_query_pool ({num_query_pool}) must be less than the number of layers ({len(depths)})"
+            )
+
+        self.embed_dim = embed_dim
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.patch_stride = patch_stride
+        self.patch_padding = patch_padding
+        self.mlp_ratio = mlp_ratio
+        self.depths = depths
+        self.num_heads = num_heads
+        self.num_layers = len(depths)
+        self.embed_dim_multiplier = embed_dim_multiplier
+        self.num_query_pool = num_query_pool
+        self.query_stride = query_stride
+        self.masked_unit_size = masked_unit_size
+        self.masked_unit_attention = masked_unit_attention
+        self.drop_path_rate = drop_path_rate
+        self.num_channels = num_channels
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.layer_norm_init = layer_norm_init
+        self.layer_norm_eps = layer_norm_eps
+        self.decoder_hidden_size = decoder_hidden_size
+        self.decoder_depth = decoder_depth
+        self.decoder_num_heads = decoder_num_heads
+        self.normalize_pixel_loss = normalize_pixel_loss
+        self.mask_ratio = mask_ratio
+        # we set the hidden_size attribute in order to make Hiera work with VisionEncoderDecoderModel
+        # this indicates the channel dimension after the last stage of the model
+        self.hidden_size = int(embed_dim * embed_dim_multiplier ** (len(depths) - 1))
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["HieraConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/modeling_hiera.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/modeling_hiera.py
new file mode 100644
index 0000000000000000000000000000000000000000..0c084f0f836e18c4891491ea983676e9752e80a2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hiera/modeling_hiera.py
@@ -0,0 +1,1439 @@
+# coding=utf-8
+# Copyright 2024 Meta and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Hiera model."""
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BackboneOutput,
+    BaseModelOutput,
+    BaseModelOutputWithPooling,
+    ImageClassifierOutput,
+    ModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging, torch_int
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_hiera import HieraConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Hiera encoder's outputs, with potential hidden states and attentions.
+    """
+)
+class HieraEncoderOutput(ModelOutput):
+    r"""
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Hiera model's outputs that also contains a pooling of the last hidden states.
+    """
+)
+class HieraModelOutput(ModelOutput):
+    r"""
+    pooler_output (`torch.FloatTensor` of shape `(batch_size, hidden_size)`, *optional*, returned when `add_pooling_layer=True` is passed):
+        Average pooling of the last layer hidden-state.
+    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+        Tensor indicating which patches are masked (0) and which are not (1).
+    ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+        Tensor containing the original index of the (shuffled) masked patches.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    pooler_output: Optional[torch.FloatTensor] = None
+    bool_masked_pos: Optional[torch.BoolTensor] = None
+    ids_restore: Optional[torch.LongTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Hiera image classification outputs.
+    """
+)
+class HieraForImageClassificationOutput(ImageClassifierOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, `optional`):
+        Loss value for the training task.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_labels)`):
+        Prediction scores of the classification head (logits of the output layer).
+    hidden_states (`tuple(torch.FloatTensor)`, `optional`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, sequence_length, hidden_size)`. These are the unrolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs.
+    attentions (`tuple(torch.FloatTensor)`, `optional`):
+        Tuple of `torch.FloatTensor` (one for each stage) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`.
+
+        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+        heads.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, `optional`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each stage) of
+        shape `(batch_size, height, width, hidden_size)`. These are the reshaped and re-rolled hidden states of the model.
+
+        Hidden-states of the model at the output of each layer plus the initial embedding outputs reshaped to
+        include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+    attentions: Optional[tuple[torch.FloatTensor, ...]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor, ...]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class for HieraForPreTraining's outputs, with potential hidden states and attentions.
+    """
+)
+class HieraForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`):
+        Pixel reconstruction loss.
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, patch_size ** 2 * num_channels)`):
+        Pixel reconstruction logits.
+    bool_masked_pos (`torch.BoolTensor` of shape `(batch_size, sequence_length)`):
+        Tensor indicating which patches are masked (0) and which are not (1).
+    ids_restore (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+        Tensor containing the original index of the (shuffled) masked patches.
+    reshaped_hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
+        Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
+        shape `(batch_size, height, width, hidden_size)`. Hidden-states of the model at the output of each layer
+        plus the initial embedding outputs reshaped to include the spatial dimensions.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    bool_masked_pos: Optional[torch.BoolTensor] = None
+    ids_restore: Optional[torch.LongTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    reshaped_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+
+
+class HieraPatchEmbeddings(nn.Module):
+    """
+    This class turns `pixel_values` of shape `(batch_size, num_channels, height, width)` into the initial
+    `hidden_states` (patch embeddings) of shape `(batch_size, seq_length, hidden_size)` to be consumed by a
+    Transformer.
+    """
+
+    def __init__(self, config, is_mae: bool = False):
+        super().__init__()
+
+        # Support any number of spatial dimensions
+        self.spatial_dims = len(config.patch_size)
+        if self.spatial_dims != 2:
+            raise ValueError(f"The number of dimensions of the input image should be 2, but got {self.spatial_dims}.")
+        self.num_channels = config.num_channels
+        self.image_size = config.image_size[-2:]
+        self.tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)]
+        self.mask_spatial_shape = [i // s for i, s in zip(self.tokens_spatial_shape, config.masked_unit_size)]
+        self.mask_ratio = config.mask_ratio
+        self.is_mae = is_mae
+        self.projection = nn.Conv2d(
+            self.num_channels,
+            config.embed_dim,
+            kernel_size=config.patch_size,
+            stride=config.patch_stride,
+            padding=config.patch_padding,
+        )
+
+    def masked_conv(
+        self, pixel_values: torch.FloatTensor, bool_masked_pos: Optional[torch.BoolTensor] = None
+    ) -> torch.Tensor:
+        """Zero-out the masked regions of the input before conv.
+        Prevents leakage of masked regions when using overlapping kernels.
+        """
+        if bool_masked_pos is None:
+            return self.projection(pixel_values)
+
+        target_size = pixel_values.shape[2:]
+        # Reshape bool_masked_pos to (batch_size, 1, mask_unit_height, mask_unit_width)
+        bool_masked_pos = bool_masked_pos.view(pixel_values.shape[0], 1, *self.mask_spatial_shape)
+
+        bool_masked_pos = nn.functional.interpolate(bool_masked_pos.float(), size=target_size)
+
+        return self.projection(pixel_values * bool_masked_pos)
+
+    def random_masking(
+        self, pixel_values: torch.FloatTensor, noise: Optional[torch.FloatTensor] = None
+    ) -> tuple[torch.BoolTensor, torch.LongTensor]:
+        """
+        Perform per-sample random masking by per-sample shuffling. Per-sample shuffling is done by argsort random
+        noise.
+
+        Args:
+            pixel_values (`torch.FloatTensor` of shape `(batch_size, num_channels, height, width)`)
+            noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*) which is
+                mainly used for testing purposes to control randomness and maintain the reproducibility
+        """
+        batch_size = pixel_values.shape[0]
+        # Tokens selected for masking at mask unit level
+        num_windows = math.prod(self.mask_spatial_shape)
+        len_keep = int(num_windows * (1 - self.mask_ratio))
+
+        if noise is None:
+            noise = torch.rand(batch_size, num_windows, device=pixel_values.device)
+
+        # Sort noise for each sample
+        ids_shuffle = torch.argsort(noise, dim=1)
+        # ascend: small is keep, large is remove
+        ids_restore = torch.argsort(ids_shuffle, dim=1).to(pixel_values.device)
+
+        # Generate the binary bool_masked_pos: 1 is *keep*, 0 is *remove*
+        # Note this is opposite to original MAE
+        bool_masked_pos = torch.zeros([batch_size, num_windows], device=pixel_values.device)
+        bool_masked_pos[:, :len_keep] = 1
+        # Unshuffle to get the binary bool_masked_pos
+        bool_masked_pos = torch.gather(bool_masked_pos, dim=1, index=ids_restore).bool()
+
+        return bool_masked_pos, ids_restore
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        noise: Optional[torch.FloatTensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.BoolTensor], Optional[torch.LongTensor]]:
+        (bool_masked_pos, ids_restore) = (
+            self.random_masking(pixel_values, noise=noise) if self.is_mae else (None, None)
+        )
+
+        embeddings = self.masked_conv(pixel_values, bool_masked_pos)
+        embeddings = embeddings.flatten(2).transpose(2, 1)
+
+        return embeddings, bool_masked_pos, ids_restore
+
+
+class HieraEmbeddings(nn.Module):
+    """
+    Construct position and patch embeddings.
+    """
+
+    def __init__(self, config: HieraConfig, is_mae: bool = False) -> None:
+        super().__init__()
+        self.patch_stride = config.patch_stride
+        tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)]
+        self.mask_spatial_shape = [i // s for i, s in zip(tokens_spatial_shape, config.masked_unit_size)]
+        self.num_tokens = math.prod(tokens_spatial_shape)
+        self.is_mae = is_mae
+
+        self.patch_embeddings = HieraPatchEmbeddings(config, is_mae=is_mae)
+
+        self.position_embeddings = nn.Parameter(torch.zeros(1, self.num_tokens, config.embed_dim))
+
+    def interpolate_pos_encoding(
+        self, embeddings: torch.Tensor, pos_embeds: torch.Tensor, height: int, width: int
+    ) -> torch.Tensor:
+        """
+        This method allows to interpolate the pre-trained position encodings, to be able to use the model on higher resolution
+        images. This method is also adapted to support torch.jit tracing, no class embeddings, and different patch strides.
+
+        Adapted from:
+        - https://github.com/facebookresearch/dino/blob/de9ee3df6cf39fac952ab558447af1fa1365362a/vision_transformer.py#L174-L194, and
+        - https://github.com/facebookresearch/dinov2/blob/e1277af2ba9496fbadf7aec6eba56e8d882d1e35/dinov2/models/vision_transformer.py#L179-L211
+        """
+
+        num_patches = embeddings.shape[1]
+        num_positions = pos_embeds.shape[1]
+
+        # always interpolate when tracing to ensure the exported model works for dynamic input shapes
+        if not torch.jit.is_tracing() and num_patches == num_positions and height == width:
+            return pos_embeds
+
+        dim = embeddings.shape[-1]
+
+        new_height = height // self.patch_stride[0]
+        new_width = width // self.patch_stride[1]
+
+        sqrt_num_positions = torch_int(num_positions**0.5)
+        pos_embeds = pos_embeds.reshape(1, sqrt_num_positions, sqrt_num_positions, dim)
+        pos_embeds = pos_embeds.permute(0, 3, 1, 2)
+
+        pos_embeds = nn.functional.interpolate(
+            pos_embeds,
+            size=(new_height, new_width),
+            mode="bicubic",
+            align_corners=False,
+        )
+
+        pos_embeds = pos_embeds.permute(0, 2, 3, 1).view(1, -1, dim)
+        return pos_embeds
+
+    def get_position_embedding(
+        self, embeddings: torch.Tensor, height: int, width: int, interpolate_pos_encoding: bool
+    ) -> torch.FloatTensor:
+        return (
+            self.interpolate_pos_encoding(embeddings, self.position_embeddings, height, width)
+            if interpolate_pos_encoding
+            else self.position_embeddings
+        )
+
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        noise: Optional[torch.FloatTensor] = None,
+        interpolate_pos_encoding: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.BoolTensor], Optional[torch.LongTensor]]:
+        height, width = pixel_values.shape[-2:]
+        embeddings, bool_masked_pos, ids_restore = self.patch_embeddings(pixel_values, noise=noise)
+        embeddings = embeddings + self.get_position_embedding(embeddings, height, width, interpolate_pos_encoding)
+        return embeddings, bool_masked_pos, ids_restore
+
+
+class HieraMaskUnitAttention(nn.Module):
+    """
+    Computes either Mask Unit or Global Attention. Also is able to perform query pooling.
+
+    Note: this assumes the tokens have already been flattened and unrolled into mask units.
+    """
+
+    def __init__(
+        self,
+        hidden_size: int,
+        hidden_size_output: int,
+        num_heads: int,
+        query_stride: int = 1,
+        window_size: int = 0,
+        use_mask_unit_attn: bool = False,
+    ) -> None:
+        super().__init__()
+        self.num_heads = num_heads
+        self.query_stride = query_stride
+        self.hidden_size_output = hidden_size_output
+
+        self.head_dim = hidden_size_output // num_heads
+        self.scale = (self.head_dim) ** -0.5
+
+        self.qkv = nn.Linear(hidden_size, 3 * hidden_size_output)
+        self.proj = nn.Linear(hidden_size_output, hidden_size_output)
+
+        self.window_size = window_size
+        self.use_mask_unit_attn = use_mask_unit_attn
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        """Input should be of shape [batch, tokens, channels]."""
+        batch_size, seq_len, _ = hidden_states.shape
+
+        num_windows = 1
+        if self.use_mask_unit_attn:
+            num_windows = seq_len // (self.query_stride * self.window_size)
+
+        qkv = self.qkv(hidden_states)
+        qkv = qkv.reshape(batch_size, -1, num_windows, 3, self.num_heads, self.head_dim)
+        qkv = qkv.permute(3, 0, 4, 2, 1, 5)
+
+        query, key, value = qkv.unbind(0)
+
+        if self.query_stride > 1:
+            # Refer to unroll to see how this performs a maxpool-Nd
+            query = query.view(batch_size, self.num_heads, num_windows, self.query_stride, -1, self.head_dim)
+            query = query.max(dim=3).values
+
+        attn_weights = (query * self.scale) @ key.transpose(-1, -2)
+        attn_weights = attn_weights.softmax(dim=-1)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = attn_weights @ value
+        attn_output = attn_output.transpose(1, 3).reshape(batch_size, -1, self.hidden_size_output)
+        attn_output = self.proj(attn_output)
+
+        return (attn_output, attn_weights) if output_attentions else (attn_output, None)
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Hiera
+class HieraDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+class HieraMlp(nn.Module):
+    def __init__(self, config, dim: int) -> None:
+        super().__init__()
+        self.activation_fn = ACT2FN[config.hidden_act]
+        self.fc1 = nn.Linear(dim, int(dim * config.mlp_ratio))
+        self.fc2 = nn.Linear(int(dim * config.mlp_ratio), dim)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class HieraLayer(nn.Module):
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        hidden_size_output: int,
+        num_heads: int,
+        drop_path: float = 0.0,
+        query_stride: int = 1,
+        window_size: int = 0,
+        use_mask_unit_attn: bool = False,
+    ) -> None:
+        super().__init__()
+
+        self.hidden_size = hidden_size
+        self.hidden_size_output = hidden_size_output
+        self.query_stride = query_stride
+
+        self.layernorm_before = nn.LayerNorm(hidden_size, eps=config.layer_norm_eps)
+        self.attn = HieraMaskUnitAttention(
+            hidden_size=hidden_size,
+            hidden_size_output=hidden_size_output,
+            num_heads=num_heads,
+            query_stride=query_stride,
+            window_size=window_size,
+            use_mask_unit_attn=use_mask_unit_attn,
+        )
+
+        self.layernorm_after = nn.LayerNorm(hidden_size_output, eps=config.layer_norm_eps)
+        self.mlp = HieraMlp(config, hidden_size_output)
+
+        self.drop_path = HieraDropPath(drop_path) if drop_path > 0 else nn.Identity()
+        if hidden_size != hidden_size_output:
+            self.proj = nn.Linear(hidden_size, hidden_size_output)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        batch_size, seq_len, _ = hidden_states.shape
+        # Attention + Q Pooling
+        hidden_states_norm = self.layernorm_before(hidden_states)
+        if self.hidden_size != self.hidden_size_output:
+            hidden_states = self.proj(hidden_states_norm)
+            # Refer to unroll to see how this performs a maxpool-Nd
+            hidden_states = (
+                hidden_states.view(batch_size, self.query_stride, -1, self.hidden_size_output).max(dim=1).values
+            )
+
+        (hidden_states_norm, attn_weights) = self.attn(
+            hidden_states_norm, head_mask, output_attentions=output_attentions
+        )
+        hidden_states = hidden_states + self.drop_path(hidden_states_norm)
+
+        residual = hidden_states
+        hidden_states = self.layernorm_after(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + self.drop_path(hidden_states)
+
+        return (hidden_states, attn_weights)
+
+
+class HieraStage(GradientCheckpointingLayer):
+    def __init__(
+        self,
+        config,
+        depth: int,
+        hidden_size: int,
+        hidden_size_output: int,
+        num_heads: int,
+        drop_path: list[float],
+        query_stride: list[int],
+        window_size: int,
+        use_mask_unit_attn: bool,
+        stage_num: Optional[int] = None,
+    ) -> None:
+        super().__init__()
+        # we need to know if the previous stage used masked attention
+        # mask unit or global attention.
+        # lag by 1 layer, so that global attention,
+        # applied post pooling on lower resolution
+        previous_stage_used_masked_attention = False
+        if stage_num is not None:
+            previous_stage_used_masked_attention = config.masked_unit_attention[stage_num - 1 if stage_num > 0 else 0]
+        self.layers = nn.ModuleList(
+            [
+                HieraLayer(
+                    config=config,
+                    hidden_size=hidden_size if i == 0 else hidden_size_output,
+                    hidden_size_output=hidden_size_output,
+                    num_heads=num_heads,
+                    drop_path=drop_path[i],
+                    query_stride=query_stride[i],
+                    window_size=window_size,
+                    use_mask_unit_attn=use_mask_unit_attn or (previous_stage_used_masked_attention and i == 0),
+                )
+                for i in range(depth)
+            ]
+        )
+
+    def forward(
+        self, hidden_states: torch.Tensor, head_mask: Optional[torch.FloatTensor], output_attentions: bool = False
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor]]:
+        for i, layer_module in enumerate(self.layers):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+            (hidden_states, attn_weights) = layer_module(
+                hidden_states, layer_head_mask, output_attentions=output_attentions
+            )
+
+        return hidden_states, attn_weights
+
+
+def undo_windowing(hidden_states: torch.Tensor, shape: list[int], mask_unit_shape: list[int]) -> torch.Tensor:
+    """
+    Restore spatial organization by undoing windowed organization of mask units.
+
+    Args:
+        hidden_states (`torch.Tensor`): The hidden states tensor of shape `[batch_size, num_mask_unit_height*num_mask_unit_width, hidden_size]`.
+        shape (`list[int]`): The original shape of the hidden states tensor before windowing.
+        mask_unit_shape (`list[int]`): The shape of the mask units used for windowing.
+
+    Returns:
+        torch.Tensor: The restored hidden states tensor of shape [batch_size, num_mask_unit_height*mask_unit_height, num_mask_unit_width*mask_unit_width, hidden_size].
+    """
+    batch_size, hidden_size = hidden_states.shape[0], hidden_states.shape[-1]
+    # From: [batch_size, num_mask_unit_height*num_mask_unit_width, hidden_size]
+    # To: [batch_size, num_mask_unit_height, num_mask_unit_width, mask_unit_height, mask_unit_width, hidden_size]
+    num_mask_units = [s // mu for s, mu in zip(shape, mask_unit_shape)]
+    hidden_states = hidden_states.view(batch_size, *num_mask_units, *mask_unit_shape, hidden_size)
+
+    # From: [batch_size, num_mask_unit_height, num_mask_unit_width, mask_unit_height, mask_unit_width, hidden_size]
+    # To: [batch_size, num_mask_unit_height*mask_unit_height, num_mask_unit_width*mask_unit_width, hidden_size]
+    hidden_states = hidden_states.permute(0, 1, 3, 2, 4, 5)
+    hidden_states = hidden_states.reshape(batch_size, *shape, hidden_size)
+
+    return hidden_states
+
+
+class HieraEncoder(nn.Module):
+    def __init__(self, config: HieraConfig) -> None:
+        super().__init__()
+        total_depth = sum(config.depths)
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, total_depth, device="cpu")]
+        # query strides rule
+        cumulative_depths = torch.tensor(config.depths, device="cpu").cumsum(0).tolist()
+        query_pool_layer = cumulative_depths[: config.num_query_pool]
+        query_strides = [math.prod(config.query_stride) if i in query_pool_layer else 1 for i in range(total_depth)]
+
+        # Transformer blocks
+        self.stages = nn.ModuleList()
+        hidden_size = config.embed_dim
+        stage_ends = [0] + cumulative_depths
+        masked_unit_area = math.prod(config.masked_unit_size)
+        query_stride_area = math.prod(config.query_stride)
+        for idx_stage, depth in enumerate(config.depths):
+            hidden_size_output = int(config.embed_dim * config.embed_dim_multiplier**idx_stage)
+
+            stage = HieraStage(
+                config=config,
+                depth=depth,
+                hidden_size=hidden_size,
+                hidden_size_output=hidden_size_output,
+                num_heads=config.num_heads[idx_stage],
+                drop_path=dpr[stage_ends[idx_stage] : stage_ends[idx_stage + 1]],
+                query_stride=query_strides[stage_ends[idx_stage] : stage_ends[idx_stage + 1]],
+                window_size=int(masked_unit_area * query_stride_area**-idx_stage),
+                use_mask_unit_attn=config.masked_unit_attention[idx_stage],
+                stage_num=idx_stage,
+            )
+
+            hidden_size = hidden_size_output
+            self.stages.append(stage)
+
+        # Setting reroll schedule
+        # The first stage has to reverse everything
+        # The next stage has to reverse all but the first unroll, etc.
+        stage_size = [i // s for i, s in zip(config.image_size, config.patch_stride)]
+        unroll_schedule = [config.query_stride] * len(config.depths[:-1])
+
+        self.schedule = {}
+        for idx_stage in range(len(config.depths)):
+            self.schedule[idx_stage] = unroll_schedule, stage_size
+            if idx_stage < config.num_query_pool:
+                stage_size = [i // s for i, s in zip(stage_size, config.query_stride)]
+                unroll_schedule = unroll_schedule[1:]
+
+        self.gradient_checkpointing = False
+
+    def reroll(
+        self, hidden_states: torch.Tensor, stage_idx: int, bool_masked_pos: Optional[torch.BoolTensor] = None
+    ) -> torch.Tensor:
+        """
+        Roll the given tensor back up to spatial order assuming it's from the given block.
+
+        If no bool_masked_pos is provided returns:
+            - [batch_size, height, width, hidden_size]
+        If a bool_masked_pos is provided returns:
+            - [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
+        """
+        schedule, size = self.schedule[stage_idx]
+        batch_size, seq_len, hidden_size = hidden_states.shape
+
+        num_dim = len(size)
+        mask_unit_shape = [1] * num_dim
+
+        for strides in schedule:
+            # Extract the current patch from seq_len
+            hidden_states = hidden_states.view(
+                batch_size, *strides, seq_len // math.prod(strides), *mask_unit_shape, hidden_size
+            )
+
+            # Move that patch into the current MU
+            # Input: [batch_size, stride, stride, seq_len//(stride*stride), mask_unit_height, mask_unit_width, hidden_size]
+            # Output: [batch_size, seq_len//(stride*stride), stride, mask_unit_height, stride, mask_unit_width, hidden_size]
+            hidden_states = hidden_states.permute(0, 3, 1, 4, 2, 5, 6)
+
+            # Reshape to [batch_size, seq_len//(stride*stride), *mask_units, hidden_size]
+            for i in range(num_dim):
+                mask_unit_shape[i] *= strides[i]
+            hidden_states = hidden_states.reshape(batch_size, -1, *mask_unit_shape, hidden_size)
+            seq_len = hidden_states.shape[1]
+
+        # Current shape (e.g., 2d: [batch_size, #num_mask_units_height*#num_mask_units_width, mask_unit_height, mask_unit_width, hidden_size])
+        hidden_states = hidden_states.view(batch_size, seq_len, *mask_unit_shape, hidden_size)
+
+        # If masked, return [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
+        if bool_masked_pos is not None:
+            return hidden_states
+
+        # If not masked, we can return [batch_size, height, width, hidden_size]
+        hidden_states = undo_windowing(hidden_states, size, mask_unit_shape)
+
+        return hidden_states
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        bool_masked_pos: Optional[torch.BoolTensor] = None,
+        head_mask: Optional[torch.FloatTensor] = None,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutput]:
+        all_hidden_states = () if output_hidden_states else None
+        all_reshaped_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+            reshaped_hidden_states = self.reroll(hidden_states, stage_idx=0, bool_masked_pos=bool_masked_pos)
+            all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,)
+
+        for i, stage_module in enumerate(self.stages):
+            layer_head_mask = head_mask[i] if head_mask is not None else None
+
+            layer_outputs = stage_module(hidden_states, layer_head_mask, output_attentions)
+
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+                reshaped_hidden_states = self.reroll(hidden_states, stage_idx=i, bool_masked_pos=bool_masked_pos)
+                all_reshaped_hidden_states = all_reshaped_hidden_states + (reshaped_hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, all_hidden_states, all_self_attentions, all_reshaped_hidden_states]
+                if v is not None
+            )
+        return HieraEncoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            reshaped_hidden_states=all_reshaped_hidden_states,
+        )
+
+
+def unroll(
+    hidden_states: torch.Tensor, image_shape: tuple[int, int], patch_stride: tuple[int, int], schedule: list[list[int]]
+) -> torch.Tensor:
+    """
+    Reorders the tokens such that patches are contiguous in memory.
+    E.g., given [batch_size, (height, width), hidden_size] and stride of (stride, stride), this will re-order the tokens as
+    [batch_size, (stride, stride, height // stride, width // stride), hidden_size]
+
+    This allows operations like Max2d to be computed as x.view(batch_size, stride*stride, -1, hidden_size).max(dim=1).
+    Not only is this faster, but it also makes it easy to support inputs of arbitrary
+    dimensions in addition to patch-wise sparsity.
+
+    Performing this operation multiple times in sequence puts entire windows as contiguous
+    in memory. For instance, if you applied the stride (2, 2) 3 times, entire windows of
+    size 8x8 would be contiguous in memory, allowing operations like mask unit attention
+    computed easily and efficiently, while also allowing max to be applied sequentially.
+
+    Note: This means that intermediate values of the model are not in height x width order, so they
+    need to be re-rolled if you want to use the intermediate values as a height x width feature map.
+    The last block of the network is fine though, since by then the strides are all consumed.
+    """
+    batch_size, _, hidden_size = hidden_states.shape
+
+    size = [i // s for i, s in zip(image_shape, patch_stride)]
+
+    current_size = size
+    hidden_states = hidden_states.view(*([batch_size] + current_size + [hidden_size]))
+
+    for strides in schedule:
+        # Move patches with the given strides to the batch dimension
+
+        # Create a view of the tensor with the patch stride as separate dims
+        # For example in 2d: [batch_size, height // stride, stride, width // stride, stride, C]
+        current_size = [i // s for i, s in zip(current_size, strides)]
+        # initialize new_shape with [height // stride, stride, width // stride, stride]
+        new_shape = [item for pair in zip(current_size, strides) for item in pair]
+        # add batch_size and hidden_size to new_shape
+        new_shape = [batch_size] + new_shape + [hidden_size]
+        hidden_states = hidden_states.view(new_shape)
+
+        # Move the patch stride into the batch dimension
+        # For example in 2d: [batch_size, stride, stride, height // stride, width // stride, hidden_size]
+        num_dims = len(new_shape)
+        permute = [0] + list(range(2, num_dims - 1, 2)) + list(range(1, num_dims - 1, 2)) + [num_dims - 1]
+        hidden_states = hidden_states.permute(permute)
+
+        # Now finally flatten the relevant dims into the batch dimension
+        hidden_states = hidden_states.flatten(0, len(strides))
+        batch_size *= math.prod(strides)
+
+    hidden_states = hidden_states.reshape(-1, math.prod(size), hidden_size)
+    return hidden_states
+
+
+@auto_docstring
+class HieraPreTrainedModel(PreTrainedModel):
+    config: HieraConfig
+    base_model_prefix = "hiera"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module) -> None:
+        """Initialize the weights"""
+        std = self.config.initializer_range
+
+        if isinstance(module, HieraEmbeddings):
+            nn.init.trunc_normal_(module.position_embeddings, std=std)
+
+        elif isinstance(module, HieraDecoder):
+            nn.init.trunc_normal_(module.mask_token, std=std)
+            nn.init.trunc_normal_(module.decoder_position_embeddings, std=std)
+
+        elif isinstance(module, (nn.Linear, nn.Conv1d, nn.Conv2d)):
+            nn.init.trunc_normal_(module.weight, std=std)
+            if module.bias is not None:
+                nn.init.constant_(module.bias, std)
+
+        elif isinstance(module, nn.LayerNorm):
+            nn.init.constant_(module.bias, std)
+            nn.init.constant_(module.weight, self.config.layer_norm_init)
+
+
+class HieraPooler(nn.Module):
+    def __init__(self, config: HieraConfig):
+        super().__init__()
+        num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1))
+        self.layernorm = nn.LayerNorm(num_features, eps=config.layer_norm_eps)
+        self.pooler = nn.AdaptiveAvgPool1d(1)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = hidden_states.transpose(1, 2)
+        pooled_output = self.pooler(hidden_states)
+        pooled_output = torch.flatten(pooled_output, 1)
+        pooled_output = self.layernorm(pooled_output)
+        return pooled_output
+
+
+@auto_docstring
+class HieraModel(HieraPreTrainedModel):
+    def __init__(self, config: HieraConfig, add_pooling_layer: bool = True, is_mae: bool = False):
+        r"""
+        add_pooling_layer (`bool`, *optional*, defaults to `True`):
+            Whether or not to apply pooling layer.
+        is_mae (`bool`, *optional*, defaults to `False`):
+            Whether or not to run the model on MAE mode.
+        """
+        super().__init__(config)
+        self.num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1))
+
+        self.embeddings = HieraEmbeddings(config, is_mae=is_mae)
+        self.encoder = HieraEncoder(config)
+
+        self.unroll_schedule = [config.query_stride] * len(config.depths[:-1])
+
+        self.pooler = HieraPooler(config) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> HieraPatchEmbeddings:
+        return self.embeddings.patch_embeddings
+
+    def _prune_heads(self, heads_to_prune: dict[int, list[int]]) -> None:
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        for layer, heads in heads_to_prune.items():
+            self.encoder.layer[layer].attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        noise: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPooling]:
+        r"""
+        noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
+            Mainly used for testing purposes to control randomness and maintain the reproducibility
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        head_mask = self.get_head_mask(head_mask, len(self.config.depths))
+
+        embedding_output, bool_masked_pos, ids_restore = self.embeddings(
+            pixel_values, interpolate_pos_encoding=interpolate_pos_encoding, noise=noise
+        )
+
+        image_shape = (pixel_values.shape[-2], pixel_values.shape[-1])
+        hidden_states = unroll(
+            embedding_output,
+            image_shape=image_shape,
+            patch_stride=self.config.patch_stride,
+            schedule=self.unroll_schedule,
+        )
+
+        # Discard masked tokens if bool_masked_pos is provided
+        if bool_masked_pos is not None:
+            mask_unit_area = math.prod(self.config.masked_unit_size)
+            batch_size, _, hidden_size = hidden_states.shape
+            positions = bool_masked_pos.unsqueeze(-1).tile(1, mask_unit_area, hidden_size)
+            hidden_states = hidden_states[positions]
+            hidden_states = hidden_states.view(batch_size, -1, hidden_size)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            bool_masked_pos=bool_masked_pos,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        sequence_output = encoder_outputs[0]
+        pooled_output = None
+        if self.pooler is not None:
+            pooled_output = self.pooler(sequence_output)
+
+        if not return_dict:
+            head_outputs = (sequence_output, pooled_output) if pooled_output is not None else (sequence_output,)
+            head_outputs = (
+                head_outputs + (bool_masked_pos, ids_restore) if bool_masked_pos is not None else head_outputs
+            )
+            return head_outputs + encoder_outputs[1:]
+
+        return HieraModelOutput(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            bool_masked_pos=bool_masked_pos,
+            ids_restore=ids_restore,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+            reshaped_hidden_states=encoder_outputs.reshaped_hidden_states,
+        )
+
+
+class HieraDecoder(nn.Module):
+    def __init__(self, config: HieraConfig):
+        super().__init__()
+        num_features = int(config.embed_dim * config.embed_dim_multiplier ** (len(config.depths) - 1))
+        tokens_spatial_shape = [i // s for i, s in zip(config.image_size, config.patch_stride)]
+        self.tokens_spatial_shape_final = [
+            i // s ** (config.num_query_pool) for i, s in zip(tokens_spatial_shape, config.query_stride)
+        ]
+        self.mask_unit_spatial_shape_final = [
+            i // s ** (config.num_query_pool) for i, s in zip(config.masked_unit_size, config.query_stride)
+        ]
+
+        self.decoder_embeddings = nn.Linear(num_features, config.decoder_hidden_size)
+
+        self.mask_token = nn.Parameter(torch.zeros(1, 1, config.decoder_hidden_size))
+
+        self.decoder_position_embeddings = nn.Parameter(
+            torch.zeros(1, math.prod(self.tokens_spatial_shape_final), config.decoder_hidden_size)
+        )
+
+        self.decoder_block = HieraStage(
+            config=config,
+            hidden_size=config.decoder_hidden_size,
+            hidden_size_output=config.decoder_hidden_size,
+            num_heads=config.decoder_num_heads,
+            depth=config.decoder_depth,
+            use_mask_unit_attn=False,
+            drop_path=[0.0] * config.decoder_depth,
+            query_stride=[1] * config.decoder_depth,
+            window_size=0,
+        )
+
+        self.decoder_norm = nn.LayerNorm(config.decoder_hidden_size, eps=config.layer_norm_eps)
+
+        # patch stride of prediction
+        self.pred_stride = config.patch_stride[-1] * (config.query_stride[-1] ** config.num_query_pool)
+        pred_dim = (self.pred_stride ** len(config.query_stride)) * config.num_channels
+
+        self.decoder_pred = nn.Linear(config.decoder_hidden_size, pred_dim)
+
+    def forward(
+        self,
+        encoder_hidden_states: torch.Tensor,
+        bool_masked_pos: torch.BoolTensor,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, torch.BoolTensor]:
+        # Embed tokens
+        hidden_states = self.decoder_embeddings(encoder_hidden_states)
+
+        # Combine visible and bool_masked_pos tokens
+
+        # hidden_states : [batch_size, num_mask_units_visible, *mask_unit_spatial_shape_final, decoder_hidden_size]
+        # bool_masked_pos: [batch_size, num_mask_units]
+        mask_unit_height, mask_unit_width, decoder_hidden_size = hidden_states.shape[2:]
+        batch_size, num_mask_units = bool_masked_pos.shape
+
+        decoder_hidden_states = torch.zeros(
+            batch_size,
+            num_mask_units,
+            mask_unit_height,
+            mask_unit_width,
+            decoder_hidden_size,
+            device=hidden_states.device,
+            dtype=hidden_states.dtype,
+        )
+        mask_tokens = self.mask_token.view(1, 1, 1, 1, -1)
+        bool_masked_pos = bool_masked_pos.reshape(batch_size, num_mask_units, 1, 1, 1)
+        bool_masked_pos = bool_masked_pos.expand(-1, -1, mask_unit_height, mask_unit_width, decoder_hidden_size)
+        decoder_hidden_states[bool_masked_pos] = hidden_states.flatten()
+        decoder_hidden_states = (
+            1 - bool_masked_pos.float()
+        ) * mask_tokens + bool_masked_pos.float() * decoder_hidden_states
+
+        # Get back spatial order
+        hidden_states = undo_windowing(
+            decoder_hidden_states,
+            self.tokens_spatial_shape_final,
+            self.mask_unit_spatial_shape_final,
+        )
+        bool_masked_pos = undo_windowing(
+            bool_masked_pos[..., 0:1],
+            self.tokens_spatial_shape_final,
+            self.mask_unit_spatial_shape_final,
+        )
+
+        # Flatten
+        hidden_states = hidden_states.reshape(hidden_states.shape[0], -1, hidden_states.shape[-1])
+        bool_masked_pos = bool_masked_pos.view(hidden_states.shape[0], -1)
+
+        # Add pos embed
+        hidden_states = hidden_states + self.decoder_position_embeddings
+
+        # Apply decoder blocks
+        hidden_states, attn_weights = self.decoder_block(
+            hidden_states, head_mask=head_mask, output_attentions=output_attentions
+        )
+        hidden_states = self.decoder_norm(hidden_states)
+
+        # Predictor projection
+        hidden_states = self.decoder_pred(hidden_states)
+
+        return hidden_states, bool_masked_pos
+
+
+class HieraMultiScaleHead(nn.Module):
+    def __init__(self, config: HieraConfig):
+        super().__init__()
+        self.mask_unit_spatial_shape_final = [
+            i // s ** (config.num_query_pool) for i, s in zip(config.masked_unit_size, config.query_stride)
+        ]
+        self.stage_dimensions = [
+            int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(len(config.depths))
+        ]
+        current_masked_unit_size = config.masked_unit_size
+        self.multi_scale_fusion_heads = nn.ModuleList()
+
+        for idx in range(config.num_query_pool):
+            kernel = [i // s for i, s in zip(current_masked_unit_size, self.mask_unit_spatial_shape_final)]
+            current_masked_unit_size = [i // s for i, s in zip(current_masked_unit_size, config.query_stride)]
+            self.multi_scale_fusion_heads.append(
+                nn.Conv2d(
+                    self.stage_dimensions[idx],
+                    self.stage_dimensions[-1],
+                    kernel_size=kernel,
+                    stride=kernel,
+                )
+            )
+        self.multi_scale_fusion_heads.append(nn.Identity())
+
+    def apply_fusion_head(self, head: nn.Module, hidden_states: torch.Tensor) -> torch.Tensor:
+        if isinstance(head, nn.Identity):
+            return hidden_states
+
+        # Doing explicit to avoid problems with torch.fx
+        batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size = hidden_states.shape
+        # From: [batch_size, num_mask_units, mask_unit_height, mask_unit_width, hidden_size]
+        # To: head([batch_size * num_mask_units, hidden_size, mask_unit_height, mask_unit_width])
+        hidden_states = hidden_states.reshape(
+            batch_size * num_mask_units, mask_unit_height, mask_unit_width, hidden_size
+        )
+        hidden_states = hidden_states.permute(0, 3, 1, 2)
+        hidden_states = head(hidden_states)
+
+        # Restore original layout
+        hidden_states = hidden_states.permute(0, 2, 3, 1)
+        mask_unit_height_final, mask_unit_width_final, hidden_size = hidden_states.shape[1:]
+        hidden_states = hidden_states.reshape(
+            batch_size, num_mask_units, mask_unit_height_final, mask_unit_width_final, hidden_size
+        )
+
+        return hidden_states
+
+    def forward(self, feature_maps: list[torch.Tensor]) -> torch.Tensor:
+        # Multi-scale fusion
+        hidden_states = 0.0
+        for head, feature_map in zip(self.multi_scale_fusion_heads, feature_maps):
+            hidden_states = hidden_states + self.apply_fusion_head(head, feature_map)
+
+        return hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    The Hiera Model transformer with the decoder on top for self-supervised pre-training.
+
+    <Tip>
+
+    Note that we provide a script to pre-train this model on custom data in our [examples
+    directory](https://github.com/huggingface/transformers/tree/main/examples/pytorch/image-pretraining).
+
+    </Tip>
+    """
+)
+class HieraForPreTraining(HieraPreTrainedModel):
+    def __init__(self, config: HieraConfig) -> None:
+        super().__init__(config)
+        # Encoder
+        self.hiera = HieraModel(config, add_pooling_layer=False, is_mae=True)
+        self.encoder_norm = nn.LayerNorm(self.hiera.num_features, eps=config.layer_norm_eps)
+        # Multi-scale fusion heads
+        self.multiscale_fusion = HieraMultiScaleHead(config)
+        # Decoder
+        self.decoder = HieraDecoder(config)
+        self.pred_stride = self.decoder.pred_stride
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_pixel_label_2d(self, pixel_values: torch.Tensor, bool_masked_pos: torch.BoolTensor) -> torch.Tensor:
+        # bool_masked_pos (boolean tensor): True means *masked*
+        pixel_values = pixel_values.permute(0, 2, 3, 1)
+
+        size = self.pred_stride
+        label = pixel_values.unfold(1, size, size).unfold(2, size, size)
+        label = label.flatten(1, 2).flatten(2)
+        label = label[bool_masked_pos]
+        if self.config.normalize_pixel_loss:
+            mean = label.mean(dim=-1, keepdim=True)
+            var = label.var(dim=-1, keepdim=True)
+            label = (label - mean) / (var + 1.0e-6) ** 0.5
+
+        return label
+
+    def forward_loss(self, pixel_values: torch.Tensor, logits: torch.Tensor, bool_masked_pos: torch.BoolTensor):
+        # We invert the bool_masked_pos such that 1.0 is *masked*
+        bool_masked_pos = ~bool_masked_pos
+        label = self.get_pixel_label_2d(pixel_values, bool_masked_pos)
+
+        logits = logits[bool_masked_pos]
+        loss = (logits - label) ** 2
+        loss = loss.mean()
+
+        return loss
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        noise: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, HieraForPreTrainingOutput]:
+        r"""
+        noise (`torch.FloatTensor` of shape `(batch_size, num_mask_units)`, *optional*):
+            Mainly used for testing purposes to control randomness and maintain the reproducibility
+
+        Examples:
+        ```python
+        >>> from transformers import AutoImageProcessor, HieraForPreTraining
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-mae-hf")
+        >>> model = HieraForPreTraining.from_pretrained("facebook/hiera-tiny-224-mae-hf")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        >>> loss = outputs.loss
+        >>> print(list(logits.shape))
+        [1, 196, 768]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.hiera(
+            pixel_values,
+            noise=noise,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        feature_maps = outputs[-1]
+        bool_masked_pos = outputs[1]
+        ids_to_restore = outputs[2]
+        # Take only the query pooled and last hidden states
+        feature_maps = feature_maps[1 : self.hiera.config.num_query_pool + 1] + (feature_maps[-1],)
+        fused_hidden_states = self.multiscale_fusion(feature_maps)
+        fused_hidden_states = self.encoder_norm(fused_hidden_states)
+
+        # Reconstruct pixel values
+        logits, bool_masked_pos = self.decoder(
+            fused_hidden_states,
+            bool_masked_pos=bool_masked_pos,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+        )
+
+        loss = self.forward_loss(pixel_values, logits, bool_masked_pos)
+
+        if not return_dict:
+            output = (logits, bool_masked_pos, ids_to_restore)
+            if output_hidden_states:
+                output = output + (outputs[3],)
+            if output_attentions:
+                output = output + (outputs[4],)
+            if output_hidden_states:
+                output = output + (outputs[-1],)
+            return ((loss,) + output) if loss is not None else output
+
+        return HieraForPreTrainingOutput(
+            loss=loss,
+            logits=logits,
+            bool_masked_pos=bool_masked_pos,
+            ids_restore=ids_to_restore,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+            reshaped_hidden_states=outputs.reshaped_hidden_states if output_hidden_states else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Hiera Model transformer with an image classification head on top (a linear layer on top of the final hidden state with
+    average pooling) e.g. for ImageNet.
+
+    <Tip>
+
+        Note that it's possible to fine-tune Hiera on higher resolution images than the ones it has been trained on, by
+        setting `interpolate_pos_encoding` to `True` in the forward of the model. This will interpolate the pre-trained
+        position embeddings to the higher resolution.
+
+    </Tip>
+    """
+)
+class HieraForImageClassification(HieraPreTrainedModel):
+    def __init__(self, config: HieraConfig) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.hiera = HieraModel(config, add_pooling_layer=True, is_mae=False)
+
+        # Classifier head
+        self.classifier = (
+            nn.Linear(self.hiera.num_features, config.num_labels) if config.num_labels > 0 else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values,
+        head_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        interpolate_pos_encoding: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, HieraForImageClassificationOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        outputs = self.hiera(
+            pixel_values,
+            head_mask=head_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            interpolate_pos_encoding=interpolate_pos_encoding,
+            return_dict=return_dict,
+        )
+
+        pooled_output = outputs[1]
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return HieraForImageClassificationOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            reshaped_hidden_states=outputs.reshaped_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Hiera backbone, to be used with frameworks like DETR and MaskFormer.
+    """
+)
+class HieraBackbone(HieraPreTrainedModel, BackboneMixin):
+    def __init__(self, config: HieraConfig):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.num_features = [config.embed_dim] + [
+            int(config.embed_dim * config.embed_dim_multiplier**i) for i in range(len(config.depths))
+        ]
+        self.embeddings = HieraEmbeddings(config, is_mae=False)
+        self.encoder = HieraEncoder(config)
+
+        # Add layer norms to hidden states of out_features
+        hidden_states_norms = {}
+        for stage, num_channels in zip(self._out_features, self.channels):
+            hidden_states_norms[stage] = nn.LayerNorm(num_channels)
+        self.hidden_states_norms = nn.ModuleDict(hidden_states_norms)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.embeddings.patch_embeddings
+
+    def forward(
+        self,
+        pixel_values: torch.Tensor,
+        output_hidden_states: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> BackboneOutput:
+        """
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, AutoBackbone
+        >>> import torch
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> processor = AutoImageProcessor.from_pretrained("facebook/hiera-tiny-224-hf")
+        >>> model = AutoBackbone.from_pretrained(
+        ...     "facebook/hiera-tiny-224-hf", out_features=["stage1", "stage2", "stage3", "stage4"]
+        ... )
+
+        >>> inputs = processor(image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> feature_maps = outputs.feature_maps
+        >>> list(feature_maps[-1].shape)
+        [1, 768, 7, 7]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        embedding_output, _, _ = self.embeddings(pixel_values)
+
+        outputs = self.encoder(
+            embedding_output,
+            head_mask=None,
+            output_attentions=output_attentions,
+            output_hidden_states=True,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[-1]
+
+        feature_maps = ()
+        for stage, hidden_state in zip(self.stage_names, hidden_states):
+            if stage in self.out_features:
+                batch_size, height, width, num_channels = hidden_state.shape
+                hidden_state = hidden_state.view(batch_size, height * width, num_channels)
+                hidden_state = self.hidden_states_norms[stage](hidden_state)
+                hidden_state = hidden_state.view(batch_size, height, width, num_channels)
+                hidden_state = hidden_state.permute(0, 3, 1, 2).contiguous()
+                feature_maps += (hidden_state,)
+
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs[1],)
+            if output_attentions:
+                output += (outputs[2],)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs[1] if output_hidden_states else None,
+            attentions=outputs[2] if output_attentions else None,
+        )
+
+
+__all__ = ["HieraForImageClassification", "HieraForPreTraining", "HieraBackbone", "HieraModel", "HieraPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fcb18edb8e1d5475ea5c043da2da32aae492cc17
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ab9ce8301fb802060f302b4f299e31a9e9aca24
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/configuration_hubert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f93b20e574add0aa57b5653088a798e79c361eb8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_hubert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3573ce73b103545e61b4cd0e620aeb11245b96e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modeling_tf_hubert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee8613086ddb94a330a963bc4b8abfac8ddd99db
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hubert/__pycache__/modular_hubert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..735df04f3d1cb535c27dd77e995939ec10e0f72e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..456198490d324ea8964879648e2d51e53047ee68
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/configuration_hunyuan_v1_dense.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e7d75b63b704f42cfc0ae5b7ea2597d7cc37c93
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modeling_hunyuan_v1_dense.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5dceea97da189f2b0d6a88c5fbb9a44af39dff10
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_dense/__pycache__/modular_hunyuan_v1_dense.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd107ee7a3c16d2527806035f67702e9220cae51
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/__init__.py
@@ -0,0 +1,14 @@
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_hunyuan_v1_moe import *
+    from .modeling_hunyuan import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..386ddac1d3ebb37de330a3940ad5ac556be5bcf6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/configuration_hunyuan_v1_moe.py
@@ -0,0 +1,204 @@
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""HunYuanMoEV1 model configuration"""
+
+from typing import Union
+
+from transformers.configuration_utils import PretrainedConfig
+from transformers.utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class HunYuanMoEV1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`HunYuanMoEV1Model`]. It is used to instantiate an
+    HunYuan model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the HunYuan-7B.
+    Hunyuan-A13B-Instruct [tencent/Hunyuan-A13B-Instruct](https://huggingface.co/tencent/Hunyuan-A13B-Instruct).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 290943):
+            Vocabulary size of the HunYuan model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`HunYuanMoEV1Model`]
+        hidden_size (`int`, *optional*, defaults to 4096):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 11008):
+            Dimension of the MLP representations or shared MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 32):
+            Number of hidden layers in the Transformer decoder.
+        num_attention_heads (`int`, *optional*, defaults to 32):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details checkout [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to
+            `num_attention_heads`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model might ever be used with.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            Padding token id.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            Beginning of stream token id.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            End of stream token id.
+        eod_token_id (int, *optional*, defaults to 3):
+            Token ID representing the end-of-document marker. Used to indicate the termination of a text sequence.
+            Example: In multi-document processing, this token helps the model distinguish between separate documents.
+        sep_token_id (`int`, *optional*, defaults to 4):
+            Token ID representing the separator token (`[SEP]`), used to demarcate boundaries between different text segments.
+        pretraining_tp (`int`, *optional*, defaults to 1):
+            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
+            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
+            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
+            issue](https://github.com/pytorch/pytorch/issues/76232).
+        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
+            Whether to tie weight embeddings
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
+            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
+            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
+            `max_position_embeddings` to the expected new maximum. See the following thread for more information on how
+            these scaling strategies behave:
+            https://www.reddit.com/r/LocalLLaMA/comments/14mrgpr/dynamically_scaled_rope_further_increases/. This is an
+            experimental feature, subject to breaking API changes in future versions.
+        attention_bias (`bool`, defaults to `False`, *optional*, defaults to `False`):
+            Whether to use a bias in the query, key, value and output projection layers during self-attention.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_experts (`int` or `List`, *optional*, defaults to 1):
+            The number of experts for moe. If it is a list, it will be used as the number of experts for each layer.
+        moe_topk (int or List, *optional*, defaults to 1):
+            Number of experts selected per token (Top-K routing). List form enables layer-wise customization.
+        head_dim (`int`, *optional*, defaults to 128):
+            The attention head dimension.
+    """
+
+    model_type = "hunyuan_v1_moe"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=290943,
+        hidden_size=4096,
+        intermediate_size: int = 11008,
+        num_hidden_layers=32,
+        num_attention_heads=32,
+        num_key_value_heads=None,
+        hidden_act="silu",
+        max_position_embeddings=2048,
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        eod_token_id=3,
+        sep_token_id=4,
+        pretraining_tp=1,
+        tie_word_embeddings=False,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        attention_bias=False,
+        attention_dropout=0.0,
+        num_experts: Union[int, list] = 1,
+        moe_topk: Union[int, list] = 1,
+        head_dim=None,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.num_experts = num_experts
+        self.moe_topk = moe_topk
+
+        self.head_dim = head_dim
+        # for backward compatibility
+        if num_key_value_heads is None:
+            num_key_value_heads = num_attention_heads
+
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+        self.pretraining_tp = pretraining_tp
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        # self._rope_scaling_validation()   # TODO: Need validation?
+        self.attention_bias = attention_bias
+        self.attention_dropout = attention_dropout
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            sep_token_id=sep_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _rope_scaling_validation(self):
+        """
+        Validate the `rope_scaling` configuration.
+        """
+        if self.rope_scaling is None:
+            return
+
+        if not isinstance(self.rope_scaling, dict) or len(self.rope_scaling) != 2:
+            raise ValueError(
+                "`rope_scaling` must be a dictionary with with two fields, `type` and `factor` or `type` and `alpha`, "
+                f"got {self.rope_scaling}"
+            )
+        rope_scaling_type = self.rope_scaling.get("type", None)
+        rope_scaling_factor = self.rope_scaling.get("factor", None)
+        rope_scaling_alpha = self.rope_scaling.get("alpha", None)
+        if rope_scaling_type is None or rope_scaling_type not in ["linear", "dynamic"]:
+            raise ValueError(
+                f"`rope_scaling`'s type field must be one of ['linear', 'dynamic'], got {rope_scaling_type}"
+            )
+        if rope_scaling_factor is None and rope_scaling_alpha is None:
+            raise ValueError("`rope_scaling`'s factor or alpha field must be have one, got both of none")
+        if rope_scaling_factor is not None:
+            if not isinstance(rope_scaling_factor, float) or rope_scaling_factor <= 1.0:
+                raise ValueError(f"`rope_scaling`'s factor field must be a float > 1.0, got {rope_scaling_factor}")
+        if rope_scaling_alpha is not None:
+            if not isinstance(rope_scaling_alpha, float) or rope_scaling_alpha <= 1.0:
+                raise ValueError(f"`rope_scaling`'s alpha field must be a float > 1.0, got {rope_scaling_alpha}")
+
+
+__all__ = ["HunYuanMoEV1Config"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..043d1f8243a35638747f3ff44946748dc472fcf1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modeling_hunyuan_v1_moe.py
@@ -0,0 +1,584 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_hunyuan_v1_moe.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from transformers.cache_utils import Cache
+
+from ...activations import ACT2FN
+from ...cache_utils import DynamicCache
+from ...generation import GenerationMixin
+from ...integrations import use_kernel_forward_from_hub
+from ...masking_utils import create_causal_mask
+from ...modeling_layers import GenericForSequenceClassification, GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.generic import check_model_inputs
+from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config
+
+
+@use_kernel_forward_from_hub("RMSNorm")
+class HunYuanMoEV1RMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        HunYuanMoEV1RMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+class HunYuanMoEV1MLP(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+        self.layer_idx = layer_idx
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class HunYuanMoEV1Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", config.hidden_size // config.num_attention_heads)
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+
+        self.q_proj = nn.Linear(
+            config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.k_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.v_proj = nn.Linear(
+            config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.attention_bias
+        )
+        self.o_proj = nn.Linear(
+            config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.attention_bias
+        )
+        self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states = self.query_layernorm(query_states)
+        key_states = self.key_layernorm(key_states)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class HunYuanMoEV1Gate(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
+        self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32)
+
+    def forward(self, hidden_states):
+        bsz, seq_len, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.reshape(-1, hidden_size)
+        if self.wg.weight.dtype == torch.float32:
+            hidden_states = hidden_states.float()
+        logits = self.wg(hidden_states)
+        return logits
+
+
+class HunYuanMoEV1Moe(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
+        self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx]
+        self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx)
+        # self.wg = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=torch.float32)
+        self.experts = nn.ModuleList(
+            [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)]
+        )
+
+        self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states_mlp = self.shared_mlp(hidden_states)
+        router_logits = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be sollicitated
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop over all available experts in the model and perform the computation on each expert
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states + hidden_states_mlp
+
+
+class HunYuanMoEV1DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx)
+        self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx)
+        self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+@auto_docstring
+class HunYuanMoEV1PreTrainedModel(PreTrainedModel):
+    config: HunYuanMoEV1Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["HunYuanMoEV1DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _can_compile_fullgraph = False
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": HunYuanMoEV1DecoderLayer,
+        "attentions": HunYuanMoEV1Attention,
+    }
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class HunYuanMoEV1RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: HunYuanMoEV1Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        if self.rope_type == "dynamic" and config.rope_scaling["alpha"]:
+            # DynamicNTKAlphaRotary
+            self.dim = config.head_dim
+            base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.attention_scaling = 1.0
+        else:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class HunYuanMoEV1Model(HunYuanMoEV1PreTrainedModel):
+    def __init__(self, config: HunYuanMoEV1Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [HunYuanMoEV1DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.rotary_emb = HunYuanMoEV1RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds: torch.Tensor = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position: torch.Tensor = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = create_causal_mask(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+        )
+
+
+@auto_docstring
+class HunYuanMoEV1ForCausalLM(HunYuanMoEV1PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = HunYuanMoEV1Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, HunYuanMoEV1ForCausalLM
+
+        >>> model = HunYuanMoEV1ForCausalLM.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-hunyuan_v1_moe/HunYuanMoEV1-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class HunYuanMoEV1ForSequenceClassification(GenericForSequenceClassification, HunYuanMoEV1PreTrainedModel):
+    pass
+
+
+__all__ = [
+    "HunYuanMoEV1ForCausalLM",
+    "HunYuanMoEV1Model",
+    "HunYuanMoEV1PreTrainedModel",
+    "HunYuanMoEV1ForSequenceClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py
new file mode 100644
index 0000000000000000000000000000000000000000..a72d6268fe7021b472a141beee79ea3eadca5c84
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/hunyuan_v1_moe/modular_hunyuan_v1_moe.py
@@ -0,0 +1,273 @@
+# coding=utf-8
+# Copyright (C) 2025 THL A29 Limited, a Tencent company and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch HunYuanMoEV1 model."""
+
+from typing import Callable, Optional
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from transformers.cache_utils import Cache
+from transformers.utils import (
+    logging,
+)
+
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs
+from ..llama.modeling_llama import (
+    LlamaAttention,
+    LlamaDecoderLayer,
+    LlamaForCausalLM,
+    LlamaForSequenceClassification,
+    LlamaMLP,
+    LlamaModel,
+    LlamaPreTrainedModel,
+    LlamaRMSNorm,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from .configuration_hunyuan_v1_moe import HunYuanMoEV1Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class HunYuanMoEV1RMSNorm(LlamaRMSNorm):
+    pass
+
+
+class HunYuanMoEV1MLP(LlamaMLP):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx=None, is_shared_mlp=False):
+        super().__init__(config)
+        self.layer_idx = layer_idx
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+
+
+class HunYuanMoEV1Attention(LlamaAttention):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.query_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+        self.key_layernorm = HunYuanMoEV1RMSNorm(self.head_dim, eps=config.rms_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+        query_states = self.query_layernorm(query_states)
+        key_states = self.key_layernorm(key_states)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class HunYuanMoEV1Gate(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
+        self.wg = nn.Linear(config.hidden_size, num_experts, bias=False, dtype=torch.float32)
+
+    def forward(self, hidden_states):
+        bsz, seq_len, hidden_size = hidden_states.shape
+        hidden_states = hidden_states.reshape(-1, hidden_size)
+        if self.wg.weight.dtype == torch.float32:
+            hidden_states = hidden_states.float()
+        logits = self.wg(hidden_states)
+        return logits
+
+
+class HunYuanMoEV1Moe(nn.Module):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.num_experts = config.num_experts if isinstance(config.num_experts, int) else config.num_experts[layer_idx]
+        self.top_k = config.moe_topk if isinstance(config.moe_topk, int) else config.moe_topk[layer_idx]
+        self.gate = HunYuanMoEV1Gate(config, layer_idx=layer_idx)
+        # self.wg = nn.Linear(config.hidden_size, config.num_experts, bias=False, dtype=torch.float32)
+        self.experts = nn.ModuleList(
+            [HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=False) for _ in range(self.num_experts)]
+        )
+
+        self.shared_mlp = HunYuanMoEV1MLP(config, layer_idx=layer_idx, is_shared_mlp=True)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        batch_size, sequence_length, hidden_dim = hidden_states.shape
+        hidden_states_mlp = self.shared_mlp(hidden_states)
+        router_logits = self.gate(hidden_states)
+        hidden_states = hidden_states.view(-1, hidden_dim)
+        # router_logits: (batch * sequence_length, n_experts)
+
+        routing_weights = F.softmax(router_logits, dim=1, dtype=torch.float)
+        routing_weights, selected_experts = torch.topk(routing_weights, self.top_k, dim=-1)
+        routing_weights /= routing_weights.sum(dim=-1, keepdim=True)
+        # we cast back to the input dtype
+        routing_weights = routing_weights.to(hidden_states.dtype)
+
+        final_hidden_states = torch.zeros(
+            (batch_size * sequence_length, hidden_dim), dtype=hidden_states.dtype, device=hidden_states.device
+        )
+
+        # One hot encode the selected experts to create an expert mask
+        # this will be used to easily index which expert is going to be sollicitated
+        expert_mask = torch.nn.functional.one_hot(selected_experts, num_classes=self.num_experts).permute(2, 1, 0)
+
+        # Loop over all available experts in the model and perform the computation on each expert
+        expert_hit = torch.greater(expert_mask.sum(dim=(-1, -2)), 0).nonzero()
+        for expert_idx in expert_hit:
+            expert_layer = self.experts[expert_idx]
+            idx, top_x = torch.where(expert_mask[expert_idx].squeeze(0))
+
+            # Index the correct hidden states and compute the expert hidden state for
+            # the current expert. We need to make sure to multiply the output hidden
+            # states by `routing_weights` on the corresponding tokens (top-1 and top-2)
+            current_state = hidden_states[None, top_x].reshape(-1, hidden_dim)
+            current_hidden_states = expert_layer(current_state) * routing_weights[top_x, idx, None]
+
+            # However `index_add_` only support torch tensors for indexing so we'll use
+            # the `top_x` tensor here.
+            final_hidden_states.index_add_(0, top_x, current_hidden_states.to(hidden_states.dtype))
+        final_hidden_states = final_hidden_states.reshape(batch_size, sequence_length, hidden_dim)
+        return final_hidden_states + hidden_states_mlp
+
+
+class HunYuanMoEV1DecoderLayer(LlamaDecoderLayer):
+    def __init__(self, config: HunYuanMoEV1Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.hidden_size = config.hidden_size
+        self.self_attn = HunYuanMoEV1Attention(config=config, layer_idx=layer_idx)
+        self.mlp = HunYuanMoEV1Moe(config, layer_idx=layer_idx)
+        self.input_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.post_attention_layernorm = HunYuanMoEV1RMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layer_idx = layer_idx
+
+
+class HunYuanMoEV1PreTrainedModel(LlamaPreTrainedModel):
+    _can_compile_fullgraph = False
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+
+
+class HunYuanMoEV1RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: HunYuanMoEV1Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+        if self.rope_type == "dynamic" and config.rope_scaling["alpha"]:
+            # DynamicNTKAlphaRotary
+            self.dim = config.head_dim
+            base = config.rope_theta * config.rope_scaling.get("alpha") ** (self.dim / (self.dim - 2))
+            inv_freq = 1.0 / (base ** (torch.arange(0, self.dim, 2).float().to(device) / self.dim))
+            self.attention_scaling = 1.0
+        else:
+            inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+class HunYuanMoEV1Model(LlamaModel):
+    pass
+
+
+class HunYuanMoEV1ForCausalLM(LlamaForCausalLM):
+    pass
+
+
+class HunYuanMoEV1ForSequenceClassification(LlamaForSequenceClassification):
+    pass
+
+
+__all__ = [
+    "HunYuanMoEV1ForCausalLM",
+    "HunYuanMoEV1Model",
+    "HunYuanMoEV1PreTrainedModel",
+    "HunYuanMoEV1ForSequenceClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..143013a133fc20073555e166a94228e2241f25a1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..981c0618c278bd8d1d23f62988d920ca507b7c94
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/configuration_idefics.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..323499ede03b53efbd99cd3baf5d3af24dc89c70
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/image_processing_idefics.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aad95d179b851666680cec2dd310b8b6e53a005a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_idefics.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb2c0ea9b0ee43736ca176f79bec3c6ead26a3ff
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/modeling_tf_idefics.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb42eb4ec0f44b66bdb3ae7dfe50b83ce815ee6a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4c72bdff14fe118db3819d187d50318b6fde18c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/perceiver_tf.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54660b51492032e42f517fbcfb8af655a089547b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/processing_idefics.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1e67c74777e4f05b50ff3a3cdb473c4f193f702c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9aa9c84903da03b786a12576b074d3490d222fd6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/idefics/__pycache__/vision_tf.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..098ffb6296f547e6dd9f1f990d21e28bc5cb0f7b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_imagegpt import *
+    from .feature_extraction_imagegpt import *
+    from .image_processing_imagegpt import *
+    from .image_processing_imagegpt_fast import *
+    from .modeling_imagegpt import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..8cfa8d5e47826e4323f74b33430fa872005c88ea
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/configuration_imagegpt.py
@@ -0,0 +1,200 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""OpenAI ImageGPT configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+    from ... import FeatureExtractionMixin, TensorType
+
+logger = logging.get_logger(__name__)
+
+
+class ImageGPTConfig(PretrainedConfig):
+    """
+    This is the configuration class to store the configuration of a [`ImageGPTModel`] or a [`TFImageGPTModel`]. It is
+    used to instantiate a GPT-2 model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the ImageGPT
+    [openai/imagegpt-small](https://huggingface.co/openai/imagegpt-small) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 512):
+            Vocabulary size of the GPT-2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ImageGPTModel`] or [`TFImageGPTModel`].
+        n_positions (`int`, *optional*, defaults to 32*32):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        n_embd (`int`, *optional*, defaults to 512):
+            Dimensionality of the embeddings and hidden states.
+        n_layer (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer encoder.
+        n_head (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        n_inner (`int`, *optional*, defaults to None):
+            Dimensionality of the inner feed-forward layers. `None` will set it to 4 times n_embd
+        activation_function (`str`, *optional*, defaults to `"quick_gelu"`):
+            Activation function (can be one of the activation functions defined in src/transformers/activations.py).
+            Defaults to "quick_gelu".
+        resid_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        embd_pdrop (`int`, *optional*, defaults to 0.1):
+            The dropout ratio for the embeddings.
+        attn_pdrop (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention.
+        layer_norm_epsilon (`float`, *optional*, defaults to 1e-5):
+            The epsilon to use in the layer normalization layers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        scale_attn_weights (`bool`, *optional*, defaults to `True`):
+            Scale attention weights by dividing by sqrt(hidden_size)..
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        scale_attn_by_inverse_layer_idx (`bool`, *optional*, defaults to `False`):
+            Whether to additionally scale attention weights by `1 / layer_idx + 1`.
+        reorder_and_upcast_attn (`bool`, *optional*, defaults to `False`):
+            Whether to scale keys (K) prior to computing attention (dot-product) and upcast attention
+            dot-product/softmax to float() when training with mixed precision.
+
+    Example:
+
+    ```python
+    >>> from transformers import ImageGPTConfig, ImageGPTModel
+
+    >>> # Initializing a ImageGPT configuration
+    >>> configuration = ImageGPTConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = ImageGPTModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "imagegpt"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {
+        "hidden_size": "n_embd",
+        "max_position_embeddings": "n_positions",
+        "num_attention_heads": "n_head",
+        "num_hidden_layers": "n_layer",
+    }
+
+    def __init__(
+        self,
+        vocab_size=512 + 1,  # add one for start of sentence (sos) token
+        n_positions=32 * 32,
+        n_embd=512,
+        n_layer=24,
+        n_head=8,
+        n_inner=None,
+        activation_function="quick_gelu",
+        resid_pdrop=0.1,
+        embd_pdrop=0.1,
+        attn_pdrop=0.1,
+        layer_norm_epsilon=1e-5,
+        initializer_range=0.02,
+        scale_attn_weights=True,
+        use_cache=True,
+        tie_word_embeddings=False,
+        scale_attn_by_inverse_layer_idx=False,
+        reorder_and_upcast_attn=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.n_positions = n_positions
+        self.n_embd = n_embd
+        self.n_layer = n_layer
+        self.n_head = n_head
+        self.n_inner = n_inner
+        self.activation_function = activation_function
+        self.resid_pdrop = resid_pdrop
+        self.embd_pdrop = embd_pdrop
+        self.attn_pdrop = attn_pdrop
+        self.layer_norm_epsilon = layer_norm_epsilon
+        self.initializer_range = initializer_range
+        self.scale_attn_weights = scale_attn_weights
+        self.use_cache = use_cache
+        self.scale_attn_by_inverse_layer_idx = scale_attn_by_inverse_layer_idx
+        self.reorder_and_upcast_attn = reorder_and_upcast_attn
+        self.tie_word_embeddings = tie_word_embeddings
+
+        super().__init__(tie_word_embeddings=tie_word_embeddings, **kwargs)
+
+
+class ImageGPTOnnxConfig(OnnxConfig):
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict(
+            [
+                ("input_ids", {0: "batch", 1: "sequence"}),
+            ]
+        )
+
+    def generate_dummy_inputs(
+        self,
+        preprocessor: "FeatureExtractionMixin",
+        batch_size: int = 1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional["TensorType"] = None,
+        num_channels: int = 3,
+        image_width: int = 32,
+        image_height: int = 32,
+    ) -> Mapping[str, Any]:
+        """
+        Generate inputs to provide to the ONNX exporter for the specific framework
+
+        Args:
+            preprocessor ([`PreTrainedTokenizerBase`] or [`FeatureExtractionMixin`]):
+                The preprocessor associated with this model configuration.
+            batch_size (`int`, *optional*, defaults to -1):
+                The batch size to export the model for (-1 means dynamic axis).
+            num_choices (`int`, *optional*, defaults to -1):
+                The number of candidate answers provided for multiple choice task (-1 means dynamic axis).
+            seq_length (`int`, *optional*, defaults to -1):
+                The sequence length to export the model for (-1 means dynamic axis).
+            is_pair (`bool`, *optional*, defaults to `False`):
+                Indicate if the input is a pair (sentence 1, sentence 2)
+            framework (`TensorType`, *optional*, defaults to `None`):
+                The framework (PyTorch or TensorFlow) that the tokenizer will generate tensors for.
+            num_channels (`int`, *optional*, defaults to 3):
+                The number of channels of the generated images.
+            image_width (`int`, *optional*, defaults to 40):
+                The width of the generated images.
+            image_height (`int`, *optional*, defaults to 40):
+                The height of the generated images.
+
+        Returns:
+            Mapping[str, Tensor] holding the kwargs to provide to the model's forward function
+        """
+
+        input_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
+        inputs = dict(preprocessor(images=input_image, return_tensors=framework))
+
+        return inputs
+
+
+__all__ = ["ImageGPTConfig", "ImageGPTOnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..46787f139f10a0c339e4cea9524d34c00a03ceb6
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/feature_extraction_imagegpt.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for ImageGPT."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_imagegpt import ImageGPTImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class ImageGPTFeatureExtractor(ImageGPTImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The class ImageGPTFeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+            " Please use ImageGPTImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["ImageGPTFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..aa2114509f7026d9f266be952dc220f2de1b632d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt.py
@@ -0,0 +1,314 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for ImageGPT."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import rescale, resize, to_channel_dimension_format
+from ...image_utils import (
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, filter_out_non_signature_kwargs, is_vision_available, logging
+from ...utils.import_utils import requires
+
+
+if is_vision_available():
+    import PIL
+
+
+logger = logging.get_logger(__name__)
+
+
+def squared_euclidean_distance(a, b):
+    b = b.T
+    a2 = np.sum(np.square(a), axis=1)
+    b2 = np.sum(np.square(b), axis=0)
+    ab = np.matmul(a, b)
+    d = a2[:, None] - 2 * ab + b2[None, :]
+    return d
+
+
+def color_quantize(x, clusters):
+    x = x.reshape(-1, 3)
+    d = squared_euclidean_distance(x, clusters)
+    return np.argmin(d, axis=1)
+
+
+@requires(backends=("vision",))
+class ImageGPTImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a ImageGPT image processor. This image processor can be used to resize images to a smaller resolution
+    (such as 32x32 or 64x64), normalize them and finally color quantize them to obtain sequences of "pixel values"
+    (color clusters).
+
+    Args:
+        clusters (`np.ndarray` or `list[list[int]]`, *optional*):
+            The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
+            in `preprocess`.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's dimensions to `(size["height"], size["width"])`. Can be overridden by
+            `do_resize` in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 256, "width": 256}`):
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image pixel value to between [-1, 1]. Can be overridden by `do_normalize` in
+            `preprocess`.
+        do_color_quantize (`bool`, *optional*, defaults to `True`):
+            Whether to color quantize the image. Can be overridden by `do_color_quantize` in `preprocess`.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        # clusters is a first argument to maintain backwards compatibility with the old ImageGPTImageProcessor
+        clusters: Optional[Union[list[list[int]], np.ndarray]] = None,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_normalize: bool = True,
+        do_color_quantize: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 256, "width": 256}
+        size = get_size_dict(size)
+        self.clusters = np.array(clusters) if clusters is not None else None
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_normalize = do_normalize
+        self.do_color_quantize = do_color_quantize
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def normalize(
+        self,
+        image: np.ndarray,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Normalizes an images' pixel values to between [-1, 1].
+
+        Args:
+            image (`np.ndarray`):
+                Image to normalize.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        image = rescale(image=image, scale=1 / 127.5, data_format=data_format, input_data_format=input_data_format)
+        image = image - 1
+        return image
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_normalize: Optional[bool] = None,
+        do_color_quantize: Optional[bool] = None,
+        clusters: Optional[Union[list[list[int]], np.ndarray]] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[Union[str, ChannelDimension]] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_normalize=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`, Only
+                has an effect if `do_resize` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image
+            do_color_quantize (`bool`, *optional*, defaults to `self.do_color_quantize`):
+                Whether to color quantize the image.
+            clusters (`np.ndarray` or `list[list[int]]`, *optional*, defaults to `self.clusters`):
+                Clusters used to quantize the image of shape `(n_clusters, 3)`. Only has an effect if
+                `do_color_quantize` is set to `True`.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                Only has an effect if `do_color_quantize` is set to `False`.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        do_color_quantize = do_color_quantize if do_color_quantize is not None else self.do_color_quantize
+        clusters = clusters if clusters is not None else self.clusters
+        clusters = np.array(clusters)
+
+        images = make_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+
+        # Here, normalize() is using a constant factor to divide pixel values.
+        # hence, the method does not need image_mean and image_std.
+        validate_preprocess_arguments(
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_color_quantize and clusters is None:
+            raise ValueError("Clusters must be specified if do_color_quantize is True.")
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_normalize and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If you wish to do this, "
+                "make sure to set `do_normalize` to `False` and that pixel values are between [-1, 1].",
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [self.normalize(image=image, input_data_format=input_data_format) for image in images]
+
+        if do_color_quantize:
+            images = [to_channel_dimension_format(image, ChannelDimension.LAST, input_data_format) for image in images]
+            # color quantize from (batch_size, height, width, 3) to (batch_size, height, width)
+            images = np.array(images)
+            images = color_quantize(images, clusters).reshape(images.shape[:-1])
+
+            # flatten to (batch_size, height*width)
+            batch_size = images.shape[0]
+            images = images.reshape(batch_size, -1)
+
+            # We need to convert back to a list of images to keep consistent behaviour across processors.
+            images = list(images)
+            data = {"input_ids": images}
+        else:
+            images = [to_channel_dimension_format(image, data_format, input_data_format) for image in images]
+            data = {"pixel_values": images}
+        return BatchFeature(data=data, tensor_type=return_tensors)
+
+    def to_dict(self):
+        output = super().to_dict()
+        # Ensure clusters are JSON/equality friendly
+        if output.get("clusters") is not None and isinstance(output["clusters"], np.ndarray):
+            output["clusters"] = output["clusters"].tolist()
+        # Need to set missing keys from slow processor to match the expected behavior in save/load tests compared to fast processor
+        missing_keys = ["image_mean", "image_std", "rescale_factor", "do_rescale"]
+        for key in missing_keys:
+            if key in output:
+                output[key] = None
+
+        return output
+
+
+__all__ = ["ImageGPTImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..7a6bcc53ae1acaefa1f4a923f7d5b422c7a62d8e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/image_processing_imagegpt_fast.py
@@ -0,0 +1,198 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for ImageGPT."""
+
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+)
+from ...image_transforms import group_images_by_shape, reorder_images
+from ...image_utils import PILImageResampling
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+)
+
+
+def squared_euclidean_distance_torch(a: torch.Tensor, b: torch.Tensor) -> torch.Tensor:
+    """
+    Compute squared Euclidean distances between all pixels and clusters.
+
+    Args:
+        a: (N, 3) tensor of pixel RGB values
+        b: (M, 3) tensor of cluster RGB values
+
+    Returns:
+        (N, M) tensor of squared distances
+    """
+    b = b.t()  # (3, M)
+    a2 = torch.sum(a**2, dim=1)  # (N,)
+    b2 = torch.sum(b**2, dim=0)  # (M,)
+    ab = torch.matmul(a, b)  # (N, M)
+    d = a2[:, None] - 2 * ab + b2[None, :]  # Squared Euclidean Distance: a^2 - 2ab + b^2
+    return d  # (N, M) tensor of squared distances
+
+
+def color_quantize_torch(x: torch.Tensor, clusters: torch.Tensor) -> torch.Tensor:
+    """
+    Assign each pixel to its nearest color cluster.
+
+    Args:
+        x: (H*W, 3) tensor of flattened pixel RGB values
+        clusters: (n_clusters, 3) tensor of cluster RGB values
+
+    Returns:
+        (H*W,) tensor of cluster indices
+    """
+    d = squared_euclidean_distance_torch(x, clusters)
+    return torch.argmin(d, dim=1)
+
+
+class ImageGPTFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    """
+    clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
+        The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
+        in `preprocess`.
+    do_color_quantize (`bool`, *optional*, defaults to `True`):
+        Controls whether to apply color quantization to convert continuous pixel values to discrete cluster indices.
+        When True, each pixel is assigned to its nearest color cluster, enabling ImageGPT's discrete token modeling.
+    """
+
+    clusters: Optional[Union[np.ndarray, list[list[int]], torch.Tensor]]
+    do_color_quantize: Optional[bool]
+
+
+@auto_docstring
+class ImageGPTImageProcessorFast(BaseImageProcessorFast):
+    model_input_names = ["input_ids"]
+    resample = PILImageResampling.BILINEAR
+    do_color_quantize = True
+    clusters = None
+    image_mean = [0.5, 0.5, 0.5]
+    image_std = [0.5, 0.5, 0.5]
+    do_rescale = True
+    do_normalize = True
+    valid_kwargs = ImageGPTFastImageProcessorKwargs
+
+    def __init__(
+        self,
+        clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None,  # keep as arg for backwards compatibility
+        **kwargs: Unpack[ImageGPTFastImageProcessorKwargs],
+    ):
+        r"""
+        clusters (`np.ndarray` or `list[list[int]]` or `torch.Tensor`, *optional*):
+            The color clusters to use, of shape `(n_clusters, 3)` when color quantizing. Can be overridden by `clusters`
+            in `preprocess`.
+        """
+        clusters = torch.as_tensor(clusters, dtype=torch.float32) if clusters is not None else None
+        super().__init__(clusters=clusters, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: dict[str, int],
+        interpolation: Optional["F.InterpolationMode"],
+        do_center_crop: bool,
+        crop_size: dict[str, int],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        do_color_quantize: Optional[bool] = None,
+        clusters: Optional[Union[list, np.ndarray, torch.Tensor]] = None,
+        disable_grouping: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ):
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        pixel_values = reorder_images(processed_images_grouped, grouped_images_index)
+
+        # If color quantization is requested, perform it; otherwise return pixel values
+        if do_color_quantize:
+            # Prepare clusters
+            if clusters is None:
+                raise ValueError("Clusters must be provided for color quantization.")
+            # Convert to torch tensor if needed (clusters might be passed as list/numpy)
+            clusters_torch = (
+                torch.as_tensor(clusters, dtype=torch.float32) if not isinstance(clusters, torch.Tensor) else clusters
+            ).to(pixel_values[0].device, dtype=pixel_values[0].dtype)
+
+            # Group images by shape for batch processing
+            # We need to check if the pixel values are a tensor or a list of tensors
+            grouped_images, grouped_images_index = group_images_by_shape(
+                pixel_values, disable_grouping=disable_grouping
+            )
+            # Process each group
+            input_ids_grouped = {}
+
+            for shape, stacked_images in grouped_images.items():
+                input_ids = color_quantize_torch(
+                    stacked_images.permute(0, 2, 3, 1).reshape(-1, 3), clusters_torch
+                )  # (B*H*W, C)
+                input_ids_grouped[shape] = input_ids.reshape(stacked_images.shape[0], -1).reshape(
+                    stacked_images.shape[0], -1
+                )  # (B, H, W)
+
+            input_ids = reorder_images(input_ids_grouped, grouped_images_index)
+
+            return BatchFeature(
+                data={"input_ids": torch.stack(input_ids, dim=0) if return_tensors else input_ids},
+                tensor_type=return_tensors,
+            )
+
+        pixel_values = torch.stack(pixel_values, dim=0) if return_tensors else pixel_values
+        return BatchFeature(data={"pixel_values": pixel_values}, tensor_type=return_tensors)
+
+    def to_dict(self):
+        # Convert torch tensors to lists for JSON serialization
+        output = super().to_dict()
+        if output.get("clusters") is not None and isinstance(output["clusters"], torch.Tensor):
+            output["clusters"] = output["clusters"].tolist()
+
+        return output
+
+
+__all__ = ["ImageGPTImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py
new file mode 100644
index 0000000000000000000000000000000000000000..a962141e447946633a2598b3c0bfe43b46f10e6d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/imagegpt/modeling_imagegpt.py
@@ -0,0 +1,1024 @@
+# coding=utf-8
+# Copyright 2021 The OpenAI Team Authors and HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch OpenAI ImageGPT model."""
+
+import math
+import os
+from typing import Any, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    SequenceClassifierOutputWithPast,
+)
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import Conv1D, find_pruneable_heads_and_indices, prune_conv1d_layer
+from ...utils import (
+    auto_docstring,
+    logging,
+    torch_float,
+)
+from .configuration_imagegpt import ImageGPTConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+def load_tf_weights_in_imagegpt(model, config, imagegpt_checkpoint_path):
+    """
+    Load tf checkpoints in a pytorch model
+    """
+    try:
+        import re
+
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow model in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+    tf_path = os.path.abspath(imagegpt_checkpoint_path)
+    logger.info(f"Converting TensorFlow checkpoint from {tf_path}")
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_path)
+    names = []
+    arrays = []
+
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_path, name)
+        names.append(name)
+        arrays.append(array.squeeze())
+
+    for name, array in zip(names, arrays):
+        name = name[6:]  # skip "model/"
+        name = name.split("/")
+
+        # adam_v and adam_m are variables used in AdamWeightDecayOptimizer to calculated m and v
+        # which are not required for using pretrained model
+        if any(
+            n in ["adam_v", "adam_m", "AdamWeightDecayOptimizer", "AdamWeightDecayOptimizer_1", "global_step"]
+            for n in name
+        ) or name[-1] in ["_step"]:
+            logger.info("Skipping {}".format("/".join(name)))
+            continue
+
+        pointer = model
+        if name[-1] not in ["wtet"]:
+            pointer = getattr(pointer, "transformer")
+
+        for m_name in name:
+            if re.fullmatch(r"[A-Za-z]+\d+", m_name):
+                scope_names = re.split(r"(\d+)", m_name)
+            else:
+                scope_names = [m_name]
+
+            if scope_names[0] == "w" or scope_names[0] == "g":
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "b":
+                pointer = getattr(pointer, "bias")
+            elif scope_names[0] == "wpe" or scope_names[0] == "wte":
+                pointer = getattr(pointer, scope_names[0])
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] in ["q_proj", "k_proj", "v_proj"]:
+                pointer = getattr(pointer, "c_attn")
+                pointer = getattr(pointer, "weight")
+            elif len(name) == 3 and name[1] == "attn" and scope_names[0] == "c_proj":
+                pointer = getattr(pointer, scope_names[0])
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "wtet":
+                pointer = getattr(pointer, "lm_head")
+                pointer = getattr(pointer, "weight")
+            elif scope_names[0] == "sos":
+                pointer = getattr(pointer, "wte")
+                pointer = getattr(pointer, "weight")
+            else:
+                pointer = getattr(pointer, scope_names[0])
+            if len(scope_names) >= 2:
+                num = int(scope_names[1])
+                pointer = pointer[num]
+
+        if len(name) > 1 and name[1] == "attn" or name[-1] == "wtet" or name[-1] == "sos" or name[-1] == "wte":
+            pass  # array is used to initialize only part of the pointer so sizes won't match
+        else:
+            try:
+                assert pointer.shape == array.shape
+            except AssertionError as e:
+                e.args += (pointer.shape, array.shape)
+                raise
+
+        logger.info(f"Initialize PyTorch weight {name}")
+
+        if name[-1] == "q_proj":
+            pointer.data[:, : config.n_embd] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
+        elif name[-1] == "k_proj":
+            pointer.data[:, config.n_embd : 2 * config.n_embd] = torch.from_numpy(
+                array.reshape(config.n_embd, config.n_embd)
+            ).T
+        elif name[-1] == "v_proj":
+            pointer.data[:, 2 * config.n_embd :] = torch.from_numpy(array.reshape(config.n_embd, config.n_embd)).T
+        elif len(name) == 3 and name[1] == "attn" and name[2] == "c_proj":
+            pointer.data = torch.from_numpy(array.reshape(config.n_embd, config.n_embd))
+        elif name[-1] == "wtet":
+            pointer.data = torch.from_numpy(array)
+        elif name[-1] == "wte":
+            pointer.data[: config.vocab_size - 1, :] = torch.from_numpy(array)
+        elif name[-1] == "sos":
+            pointer.data[-1] = torch.from_numpy(array)
+        else:
+            pointer.data = torch.from_numpy(array)
+
+    return model
+
+
+class ImageGPTLayerNorm(nn.Module):
+    def __init__(self, hidden_size: tuple[int], eps: float = 1e-5):
+        super().__init__()
+        self.eps = eps
+        self.weight = nn.Parameter(torch.Tensor(hidden_size))
+
+    def forward(self, tensor: torch.Tensor) -> torch.Tensor:
+        # input is not mean centered
+        tensor = tensor / torch.sqrt(torch.mean(torch.square(tensor), axis=-1, keepdim=True) + self.eps)
+        tensor = tensor * self.weight
+        return tensor
+
+
+class ImageGPTAttention(nn.Module):
+    def __init__(self, config, is_cross_attention: Optional[bool] = False, layer_idx: Optional[int] = None):
+        super().__init__()
+
+        max_positions = config.max_position_embeddings
+        self.register_buffer(
+            "bias",
+            torch.tril(torch.ones((max_positions, max_positions), dtype=torch.bool)).view(
+                1, 1, max_positions, max_positions
+            ),
+            persistent=False,
+        )
+        self.register_buffer("masked_bias", torch.tensor(-1e4), persistent=False)
+
+        self.embed_dim = config.hidden_size
+        self.num_heads = config.num_attention_heads
+        self.head_dim = self.embed_dim // self.num_heads
+        self.split_size = self.embed_dim
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"`embed_dim` must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {self.num_heads})."
+            )
+
+        self.scale_attn_weights = config.scale_attn_weights
+        self.is_cross_attention = is_cross_attention
+
+        # Layer-wise attention scaling, reordering, and upcasting
+        self.scale_attn_by_inverse_layer_idx = config.scale_attn_by_inverse_layer_idx
+        self.layer_idx = layer_idx
+        self.reorder_and_upcast_attn = config.reorder_and_upcast_attn
+
+        if self.is_cross_attention:
+            self.c_attn = Conv1D(2 * self.embed_dim, self.embed_dim)
+            self.q_attn = Conv1D(self.embed_dim, self.embed_dim)
+        else:
+            self.c_attn = Conv1D(3 * self.embed_dim, self.embed_dim)
+        self.c_proj = Conv1D(self.embed_dim, self.embed_dim)
+
+        self.attn_dropout = nn.Dropout(config.attn_pdrop)
+        self.resid_dropout = nn.Dropout(config.resid_pdrop)
+
+        self.pruned_heads = set()
+
+    def prune_heads(self, heads):
+        if len(heads) == 0:
+            return
+        heads, index = find_pruneable_heads_and_indices(heads, self.num_heads, self.head_dim, self.pruned_heads)
+        index_attn = torch.cat([index, index + self.split_size, index + (2 * self.split_size)])
+
+        # Prune conv1d layers
+        self.c_attn = prune_conv1d_layer(self.c_attn, index_attn, dim=1)
+        self.c_proj = prune_conv1d_layer(self.c_proj, index, dim=0)
+
+        # Update hyper params
+        self.split_size = (self.split_size // self.num_heads) * (self.num_heads - len(heads))
+        self.num_heads = self.num_heads - len(heads)
+        self.pruned_heads = self.pruned_heads.union(heads)
+
+    def _attn(self, query, key, value, attention_mask=None, head_mask=None):
+        attn_weights = torch.matmul(query, key.transpose(-1, -2))
+
+        if self.scale_attn_weights:
+            attn_weights = attn_weights / torch_float(value.size(-1) ** 0.5)
+
+        # Layer-wise attention scaling
+        if self.scale_attn_by_inverse_layer_idx:
+            attn_weights = attn_weights / float(self.layer_idx + 1)
+
+        if not self.is_cross_attention:
+            # if only "normal" attention layer implements causal mask
+            query_length, key_length = query.size(-2), key.size(-2)
+            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+            mask_value = torch.finfo(attn_weights.dtype).min
+            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.Softmax(dim=-1)(attn_weights)
+
+        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op otherwise
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
+
+    def _upcast_and_reordered_attn(self, query, key, value, attention_mask=None, head_mask=None):
+        # Use `torch.baddbmm` (a bit more efficient w/ alpha param for scaling -- from Megatron-LM)
+        bsz, num_heads, q_seq_len, dk = query.size()
+        _, _, k_seq_len, _ = key.size()
+
+        # Preallocate attn_weights for `baddbmm`
+        attn_weights = torch.empty(bsz * num_heads, q_seq_len, k_seq_len, dtype=torch.float32, device=query.device)
+
+        # Compute Scale Factor
+        scale_factor = 1.0
+        if self.scale_attn_weights:
+            scale_factor /= float(value.size(-1)) ** 0.5
+
+        if self.scale_attn_by_inverse_layer_idx:
+            scale_factor /= float(self.layer_idx + 1)
+
+        # Upcast (turn off autocast) and reorder (Scale K by 1 / root(dk))
+        with torch.autocast(query.device.type, enabled=False):
+            q, k = query.reshape(-1, q_seq_len, dk), key.transpose(-1, -2).reshape(-1, dk, k_seq_len)
+            attn_weights = torch.baddbmm(attn_weights, q.float(), k.float(), beta=0, alpha=scale_factor)
+            attn_weights = attn_weights.reshape(bsz, num_heads, q_seq_len, k_seq_len)
+
+        if not self.is_cross_attention:
+            # if only "normal" attention layer implements causal mask
+            query_length, key_length = query.size(-2), key.size(-2)
+            causal_mask = self.bias[:, :, key_length - query_length : key_length, :key_length]
+            mask_value = torch.finfo(attn_weights.dtype).min
+            # Need to be a tensor, otherwise we get error: `RuntimeError: expected scalar type float but found double`.
+            # Need to be on the same device, otherwise `RuntimeError: ..., x and y to be on the same device`
+            mask_value = torch.tensor(mask_value, dtype=attn_weights.dtype, device=attn_weights.device)
+            attn_weights = torch.where(causal_mask, attn_weights, mask_value)
+
+        if attention_mask is not None:
+            # Apply the attention mask
+            attn_weights = attn_weights + attention_mask
+
+        attn_weights = nn.Softmax(dim=-1)(attn_weights)
+
+        # Downcast (if necessary) back to V's dtype (if in mixed-precision) -- No-Op if otherwise
+        if attn_weights.dtype != torch.float32:
+            raise RuntimeError("Error with upcasting, attn_weights does not have dtype torch.float32")
+        attn_weights = attn_weights.type(value.dtype)
+        attn_weights = self.attn_dropout(attn_weights)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attn_weights = attn_weights * head_mask
+
+        attn_output = torch.matmul(attn_weights, value)
+
+        return attn_output, attn_weights
+
+    def _split_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Splits hidden_size dim into attn_head_size and num_heads
+        """
+        new_shape = tensor.size()[:-1] + (num_heads, attn_head_size)
+        tensor = tensor.view(*new_shape)
+        return tensor.permute(0, 2, 1, 3)  # (batch, head, seq_length, head_features)
+
+    def _merge_heads(self, tensor, num_heads, attn_head_size):
+        """
+        Merges attn_head_size dim and num_attn_heads dim into hidden_size
+        """
+        tensor = tensor.permute(0, 2, 1, 3).contiguous()
+        new_shape = tensor.size()[:-2] + (num_heads * attn_head_size,)
+        return tensor.view(new_shape)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple:
+        is_cross_attention = encoder_hidden_states is not None
+        bsz, seq_len, _ = hidden_states.shape
+
+        if layer_past is not None:
+            if isinstance(layer_past, EncoderDecoderCache):
+                is_updated = layer_past.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = layer_past.cross_attention_cache
+                else:
+                    curr_past_key_value = layer_past.self_attention_cache
+            else:
+                curr_past_key_value = layer_past
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention:
+            if not hasattr(self, "q_attn"):
+                raise ValueError(
+                    "If class is used as cross attention, the weights `q_attn` have to be defined. "
+                    "Please make sure to instantiate class with `ImageGPTAttention(..., is_cross_attention=True)`."
+                )
+
+            if layer_past is not None and is_updated:
+                # reuse k,v, cross_attentions, and compute only q
+                query = self.q_attn(hidden_states)
+                key = curr_past_key_value.layers[self.layer_idx].keys
+                value = curr_past_key_value.layers[self.layer_idx].values
+            else:
+                query = self.q_attn(hidden_states)
+                key, value = self.c_attn(current_states).split(self.split_size, dim=2)
+                key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+                value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+        else:
+            query, key, value = self.c_attn(current_states).split(self.split_size, dim=2)
+            key = key.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            value = value.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if layer_past is not None:
+            # save all key/value_states to cache to be re-used for fast auto-regressive generation
+            cache_position = cache_position if not is_cross_attention else None
+            key, value = curr_past_key_value.update(key, value, self.layer_idx, {"cache_position": cache_position})
+            # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+            if is_cross_attention:
+                layer_past.is_updated[self.layer_idx] = True
+
+        query = query.view(bsz, seq_len, self.num_heads, self.head_dim).transpose(1, 2)
+
+        if self.reorder_and_upcast_attn:
+            attn_output, attn_weights = self._upcast_and_reordered_attn(query, key, value, attention_mask, head_mask)
+        else:
+            attn_output, attn_weights = self._attn(query, key, value, attention_mask, head_mask)
+
+        attn_output = self._merge_heads(attn_output, self.num_heads, self.head_dim)
+        attn_output = self.c_proj(attn_output)
+        attn_output = self.resid_dropout(attn_output)
+
+        return attn_output, attn_weights
+
+
+class ImageGPTMLP(nn.Module):
+    def __init__(self, intermediate_size, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = Conv1D(intermediate_size, embed_dim)
+        self.c_proj = Conv1D(embed_dim, intermediate_size)
+        self.act = ACT2FN[config.activation_function]
+        self.dropout = nn.Dropout(config.resid_pdrop)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class ImageGPTBlock(GradientCheckpointingLayer):
+    def __init__(self, config, layer_idx=None):
+        super().__init__()
+        hidden_size = config.hidden_size
+        inner_dim = config.n_inner if config.n_inner is not None else 4 * hidden_size
+
+        self.ln_1 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+        self.attn = ImageGPTAttention(config, layer_idx=layer_idx)
+        self.ln_2 = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        if config.add_cross_attention:
+            self.crossattention = ImageGPTAttention(config, is_cross_attention=True, layer_idx=layer_idx)
+            self.ln_cross_attn = ImageGPTLayerNorm(hidden_size, eps=config.layer_norm_epsilon)
+
+        self.mlp = ImageGPTMLP(inner_dim, config)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_past: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = False,
+        output_attentions: Optional[bool] = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple:
+        residual = hidden_states
+        hidden_states = self.ln_1(hidden_states)
+        attn_outputs = self.attn(
+            hidden_states,
+            layer_past=layer_past,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        attn_output = attn_outputs[0]
+        outputs = attn_outputs[1:]
+        # residual connection
+        hidden_states = attn_output + residual
+
+        if encoder_hidden_states is not None:
+            # add one self-attention block for cross-attention
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with "
+                    "cross-attention layers by setting `config.add_cross_attention=True`"
+                )
+            residual = hidden_states
+            hidden_states = self.ln_cross_attn(hidden_states)
+            cross_attn_outputs = self.crossattention(
+                hidden_states,
+                layer_past=layer_past,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            attn_output = cross_attn_outputs[0]
+            # residual connection
+            hidden_states = residual + attn_output
+            outputs = outputs + cross_attn_outputs[1:]  # add cross attentions if we output attention weights
+
+        residual = hidden_states
+        hidden_states = self.ln_2(hidden_states)
+        feed_forward_hidden_states = self.mlp(hidden_states)
+        # residual connection
+        hidden_states = residual + feed_forward_hidden_states
+
+        return (hidden_states,) + outputs
+
+
+@auto_docstring
+class ImageGPTPreTrainedModel(PreTrainedModel):
+    config: ImageGPTConfig
+    load_tf_weights = load_tf_weights_in_imagegpt
+    base_model_prefix = "transformer"
+    main_input_name = "input_ids"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["ImageGPTBlock"]
+
+    def __init__(self, *inputs, **kwargs):
+        super().__init__(*inputs, **kwargs)
+
+    def _init_weights(self, module):
+        """Initialize the weights."""
+        if isinstance(module, (nn.Linear, Conv1D)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, ImageGPTLayerNorm):
+            module.weight.data.fill_(1.0)
+
+        # Reinitialize selected weights subject to the OpenAI GPT-2 Paper Scheme:
+        #   > A modified initialization which accounts for the accumulation on the residual path with model depth. Scale
+        #   > the weights of residual layers at initialization by a factor of 1/√N where N is the # of residual layers.
+        #   >   -- GPT-2 :: https://openai.com/blog/better-language-models/
+        #
+        # Reference (Megatron-LM): https://github.com/NVIDIA/Megatron-LM/blob/main/megatron/model/gpt_model.py
+        for name, p in module.named_parameters():
+            if "c_proj" in name and "weight" in name:
+                # Special Scaled Initialization --> There are 2 Layer Norms per Transformer Block
+                p.data.normal_(mean=0.0, std=(self.config.initializer_range / math.sqrt(2 * self.config.n_layer)))
+
+
+@auto_docstring
+class ImageGPTModel(ImageGPTPreTrainedModel):
+    def __init__(self, config: ImageGPTConfig):
+        super().__init__(config)
+
+        self.embed_dim = config.hidden_size
+
+        self.wte = nn.Embedding(config.vocab_size, self.embed_dim)
+        self.wpe = nn.Embedding(config.max_position_embeddings, self.embed_dim)
+
+        self.drop = nn.Dropout(config.embd_pdrop)
+        self.h = nn.ModuleList([ImageGPTBlock(config, layer_idx=i) for i in range(config.num_hidden_layers)])
+        self.ln_f = ImageGPTLayerNorm(self.embed_dim, eps=config.layer_norm_epsilon)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.wte
+
+    def set_input_embeddings(self, new_embeddings):
+        self.wte = new_embeddings
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer}
+        """
+        for layer, heads in heads_to_prune.items():
+            self.h[layer].attn.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Any,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, ImageGPTModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTModel.from_pretrained("openai/imagegpt-small")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> last_hidden_states = outputs.last_hidden_state
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+            batch_size = input_ids.shape[0]
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            batch_size = inputs_embeds.shape[0]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        device = input_ids.device if input_ids is not None else inputs_embeds.device
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_length = past_key_values.get_seq_length() if past_key_values is not None else past_key_values
+
+        if token_type_ids is not None:
+            token_type_ids = token_type_ids.view(-1, input_shape[-1])
+
+        if position_ids is None:
+            position_ids = torch.arange(past_length, input_shape[-1] + past_length, dtype=torch.long, device=device)
+            position_ids = position_ids.unsqueeze(0)
+
+        # ImageGPTAttention mask.
+        if attention_mask is not None:
+            if batch_size <= 0:
+                raise ValueError("batch_size has to be defined and > 0")
+            attention_mask = attention_mask.view(batch_size, -1)
+            # We create a 3D attention mask from a 2D tensor mask.
+            # Sizes are [batch_size, 1, 1, to_seq_length]
+            # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+            # this attention mask is more simple than the triangular masking of causal attention
+            # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+            attention_mask = attention_mask[:, None, None, :]
+
+            # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+            # masked positions, this operation will create a tensor which is 0.0 for
+            # positions we want to attend and the dtype's smallest value for masked positions.
+            # Since we are adding it to the raw scores before the softmax, this is
+            # effectively the same as removing these entirely.
+            attention_mask = attention_mask.to(dtype=self.dtype)  # fp16 compatibility
+            attention_mask = (1.0 - attention_mask) * torch.finfo(self.dtype).min
+
+        # If a 2D or 3D attention mask is provided for the cross-attention
+        # we need to make broadcastable to [batch_size, num_heads, seq_length, seq_length]
+        if self.config.add_cross_attention and encoder_hidden_states is not None:
+            encoder_batch_size, encoder_sequence_length, _ = encoder_hidden_states.size()
+            encoder_hidden_shape = (encoder_batch_size, encoder_sequence_length)
+            if encoder_attention_mask is None:
+                encoder_attention_mask = torch.ones(encoder_hidden_shape, device=device)
+            encoder_attention_mask = self.invert_attention_mask(encoder_attention_mask)
+        else:
+            encoder_attention_mask = None
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # head_mask has shape n_layer x batch x n_heads x N x N
+        head_mask = self.get_head_mask(head_mask, self.config.n_layer)
+
+        if inputs_embeds is None:
+            inputs_embeds = self.wte(input_ids)
+        position_embeds = self.wpe(position_ids)
+        hidden_states = inputs_embeds + position_embeds.to(inputs_embeds.device)
+
+        if token_type_ids is not None:
+            token_type_embeds = self.wte(token_type_ids)
+            hidden_states = hidden_states + token_type_embeds
+
+        hidden_states = self.drop(hidden_states)
+        output_shape = input_shape + (hidden_states.size(-1),)
+
+        all_self_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+        all_hidden_states = () if output_hidden_states else None
+        for i, block in enumerate(self.h):
+            # Model parallel
+            if self.model_parallel:
+                torch.cuda.set_device(hidden_states.device)
+                # Ensure that attention_mask is always on the same device as hidden_states
+                if attention_mask is not None:
+                    attention_mask = attention_mask.to(hidden_states.device)
+                if isinstance(head_mask, torch.Tensor):
+                    head_mask = head_mask.to(hidden_states.device)
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            outputs = block(
+                hidden_states,
+                past_key_values,
+                attention_mask,
+                head_mask[i],
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                use_cache=use_cache,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+
+            hidden_states = outputs[0]
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (outputs[1],)
+                if self.config.add_cross_attention:
+                    all_cross_attentions = all_cross_attentions + (outputs[2],)
+
+            # Model Parallel: If it's the last layer for that device, put things on the next device
+            if self.model_parallel:
+                for k, v in self.device_map.items():
+                    if i == v[-1] and "cuda:" + str(k) != self.last_device:
+                        hidden_states = hidden_states.to("cuda:" + str(k + 1))
+
+        hidden_states = self.ln_f(hidden_states)
+        hidden_states = hidden_states.view(*output_shape)
+
+        # Add last hidden state
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attentions, all_cross_attentions]
+                if v is not None
+            )
+
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The ImageGPT Model transformer with a language modeling head on top (linear layer with weights tied to the input
+    embeddings).
+    """
+)
+class ImageGPTForCausalImageModeling(ImageGPTPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: ImageGPTConfig):
+        super().__init__(config)
+        self.transformer = ImageGPTModel(config)
+        self.lm_head = nn.Linear(config.n_embd, config.vocab_size - 1, bias=False)
+
+        # Model parallel
+        self.model_parallel = False
+        self.device_map = None
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+        **kwargs: Any,
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size, input_ids_length)`, *optional*):
+            Labels for language modeling. Note that the labels **are shifted** inside the model, i.e. you can set
+            `labels = input_ids` Indices are selected in `[-100, 0, ..., config.vocab_size]` All labels set to `-100`
+            are ignored (masked), the loss is only computed for labels in `[0, ..., config.vocab_size]`
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, ImageGPTForCausalImageModeling
+        >>> import torch
+        >>> import matplotlib.pyplot as plt
+        >>> import numpy as np
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTForCausalImageModeling.from_pretrained("openai/imagegpt-small")
+        >>> device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
+        >>> model.to(device)  # doctest: +IGNORE_RESULT
+
+        >>> # unconditional generation of 8 images
+        >>> batch_size = 4
+        >>> context = torch.full((batch_size, 1), model.config.vocab_size - 1)  # initialize with SOS token
+        >>> context = context.to(device)
+        >>> output = model.generate(
+        ...     input_ids=context, max_length=model.config.n_positions + 1, temperature=1.0, do_sample=True, top_k=40
+        ... )
+
+        >>> clusters = image_processor.clusters
+        >>> height = image_processor.size["height"]
+        >>> width = image_processor.size["width"]
+
+        >>> samples = output[:, 1:].detach().cpu().numpy()
+        >>> samples_img = [
+        ...     np.reshape(np.rint(127.5 * (clusters[s] + 1.0)), [height, width, 3]).astype(np.uint8) for s in samples
+        ... ]  # convert color cluster tokens back to pixels
+        >>> f, axes = plt.subplots(1, batch_size, dpi=300)
+
+        >>> for img, ax in zip(samples_img, axes):  # doctest: +IGNORE_RESULT
+        ...     ax.axis("off")
+        ...     ax.imshow(img)
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        hidden_states = transformer_outputs[0]
+
+        lm_logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # Shift so that tokens < n predict n
+            shift_logits = lm_logits[..., :-1, :].contiguous()
+            shift_labels = labels[..., 1:].contiguous()
+            # Flatten the tokens
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(shift_logits.view(-1, shift_logits.size(-1)), shift_labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=lm_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+            cross_attentions=transformer_outputs.cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The ImageGPT Model transformer with an image classification head on top (linear layer).
+    [`ImageGPTForImageClassification`] average-pools the hidden states in order to do the classification.
+    """
+)
+class ImageGPTForImageClassification(ImageGPTPreTrainedModel):
+    def __init__(self, config: ImageGPTConfig):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.transformer = ImageGPTModel(config)
+        self.score = nn.Linear(config.n_embd, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        token_type_ids: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs: Any,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        input_ids (`torch.LongTensor` of shape `(batch_size, input_ids_length)`):
+            `input_ids_length` = `sequence_length` if `past_key_values` is `None` else
+            `past_key_values.get_seq_length()` (`sequence_length` of input past key value states). Indices of input
+            sequence tokens in the vocabulary.
+
+            If `past_key_values` is used, only `input_ids` that do not have their past calculated should be passed as
+            `input_ids`.
+
+            Indices can be obtained using [`AutoImageProcessor`]. See [`ImageGPTImageProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, ImageGPTForImageClassification
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("openai/imagegpt-small")
+        >>> model = ImageGPTForImageClassification.from_pretrained("openai/imagegpt-small")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+        >>> outputs = model(**inputs)
+        >>> logits = outputs.logits
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.transformer(
+            input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        # average-pool the hidden states along the sequence dimension
+        pooled_hidden_states = hidden_states.mean(dim=1)
+        # project from (batch_size, hidden_size) to (batch_size, num_labels)
+        logits = self.score(pooled_hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = [
+    "ImageGPTForCausalImageModeling",
+    "ImageGPTForImageClassification",
+    "ImageGPTModel",
+    "ImageGPTPreTrainedModel",
+    "load_tf_weights_in_imagegpt",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9564a1ea9ec92004d96ff6d980ab0ef4386b1d33
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..448307e6dc8d00f1f1c48f624bbcdc7bd802d563
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/configuration_informer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8c13bb544a33bf33c9091b6f06115dd096f279db
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/informer/__pycache__/modular_informer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..274fad9c1fa8509362f70ede73ba0dc6526805cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1760c53ec46e5f27f6b46c593873eeae56f68da7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/configuration_instructblipvideo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7134e9ca6e050e71448230857587d75b7d2c2a29
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/image_processing_instructblipvideo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cd1f85b47bad95a2adc0098a1fbdf59b3c550a75
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modeling_instructblipvideo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d63e1eb666d33cafa5098c62a0bde4d127183083
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/modular_instructblipvideo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..19205bc39e25210cff274811b5ca1694c2943e17
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/instructblipvideo/__pycache__/video_processing_instructblipvideo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ea61afd214787c6c955e44bb43c4c5865be0cca
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..184b7900cc80b34a312be42477729aa679074238
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/configuration_kyutai_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3396e2a546e8ab358f521ba74af9016d23bb2165
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/feature_extraction_kyutai_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8ea4da3ecc078eeac929c3b68bbdb03d251637aa
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modeling_kyutai_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3745bb7c8577d6c508b66bf4f09af4f46c84c763
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/modular_kyutai_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd4cd5987c496f6fea693f308fdd5ede8996a7d8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/kyutai_speech_to_text/__pycache__/processing_kyutai_speech_to_text.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..c845bb43b346a2d00a93193125f8b3d6d72c1ebc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/configuration_layoutlmv3.py
@@ -0,0 +1,294 @@
+# coding=utf-8
+# Copyright 2022 Microsoft Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""LayoutLMv3 model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+from typing import TYPE_CHECKING, Any, Optional
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...onnx.utils import compute_effective_axis_dimension
+from ...utils import logging
+
+
+if TYPE_CHECKING:
+    from ...processing_utils import ProcessorMixin
+    from ...utils import TensorType
+
+
+logger = logging.get_logger(__name__)
+
+
+class LayoutLMv3Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LayoutLMv3Model`]. It is used to instantiate an
+    LayoutLMv3 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the LayoutLMv3
+    [microsoft/layoutlmv3-base](https://huggingface.co/microsoft/layoutlmv3-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the LayoutLMv3 model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`LayoutLMv3Model`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimension of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_probs_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        max_position_embeddings (`int`, *optional*, defaults to 512):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        type_vocab_size (`int`, *optional*, defaults to 2):
+            The vocabulary size of the `token_type_ids` passed when calling [`LayoutLMv3Model`].
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-5):
+            The epsilon used by the layer normalization layers.
+        max_2d_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum value that the 2D position embedding might ever be used with. Typically set this to something
+            large just in case (e.g., 1024).
+        coordinate_size (`int`, *optional*, defaults to `128`):
+            Dimension of the coordinate embeddings.
+        shape_size (`int`, *optional*, defaults to `128`):
+            Dimension of the width and height embeddings.
+        has_relative_attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a relative attention bias in the self-attention mechanism.
+        rel_pos_bins (`int`, *optional*, defaults to 32):
+            The number of relative position bins to be used in the self-attention mechanism.
+        max_rel_pos (`int`, *optional*, defaults to 128):
+            The maximum number of relative positions to be used in the self-attention mechanism.
+        max_rel_2d_pos (`int`, *optional*, defaults to 256):
+            The maximum number of relative 2D positions in the self-attention mechanism.
+        rel_2d_pos_bins (`int`, *optional*, defaults to 64):
+            The number of 2D relative position bins in the self-attention mechanism.
+        has_spatial_attention_bias (`bool`, *optional*, defaults to `True`):
+            Whether or not to use a spatial attention bias in the self-attention mechanism.
+        visual_embed (`bool`, *optional*, defaults to `True`):
+            Whether or not to add patch embeddings.
+        input_size (`int`, *optional*, defaults to `224`):
+            The size (resolution) of the images.
+        num_channels (`int`, *optional*, defaults to `3`):
+            The number of channels of the images.
+        patch_size (`int`, *optional*, defaults to `16`)
+            The size (resolution) of the patches.
+        classifier_dropout (`float`, *optional*):
+            The dropout ratio for the classification head.
+
+    Example:
+
+    ```python
+    >>> from transformers import LayoutLMv3Config, LayoutLMv3Model
+
+    >>> # Initializing a LayoutLMv3 microsoft/layoutlmv3-base style configuration
+    >>> configuration = LayoutLMv3Config()
+
+    >>> # Initializing a model (with random weights) from the microsoft/layoutlmv3-base style configuration
+    >>> model = LayoutLMv3Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "layoutlmv3"
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout_prob=0.1,
+        attention_probs_dropout_prob=0.1,
+        max_position_embeddings=512,
+        type_vocab_size=2,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        pad_token_id=1,
+        bos_token_id=0,
+        eos_token_id=2,
+        max_2d_position_embeddings=1024,
+        coordinate_size=128,
+        shape_size=128,
+        has_relative_attention_bias=True,
+        rel_pos_bins=32,
+        max_rel_pos=128,
+        rel_2d_pos_bins=64,
+        max_rel_2d_pos=256,
+        has_spatial_attention_bias=True,
+        text_embed=True,
+        visual_embed=True,
+        input_size=224,
+        num_channels=3,
+        patch_size=16,
+        classifier_dropout=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_size=vocab_size,
+            hidden_size=hidden_size,
+            num_hidden_layers=num_hidden_layers,
+            num_attention_heads=num_attention_heads,
+            intermediate_size=intermediate_size,
+            hidden_act=hidden_act,
+            hidden_dropout_prob=hidden_dropout_prob,
+            attention_probs_dropout_prob=attention_probs_dropout_prob,
+            max_position_embeddings=max_position_embeddings,
+            type_vocab_size=type_vocab_size,
+            initializer_range=initializer_range,
+            layer_norm_eps=layer_norm_eps,
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+        self.max_2d_position_embeddings = max_2d_position_embeddings
+        self.coordinate_size = coordinate_size
+        self.shape_size = shape_size
+        self.has_relative_attention_bias = has_relative_attention_bias
+        self.rel_pos_bins = rel_pos_bins
+        self.max_rel_pos = max_rel_pos
+        self.has_spatial_attention_bias = has_spatial_attention_bias
+        self.rel_2d_pos_bins = rel_2d_pos_bins
+        self.max_rel_2d_pos = max_rel_2d_pos
+        self.text_embed = text_embed
+        self.visual_embed = visual_embed
+        self.input_size = input_size
+        self.num_channels = num_channels
+        self.patch_size = patch_size
+        self.classifier_dropout = classifier_dropout
+
+
+class LayoutLMv3OnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.12")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        # The order of inputs is different for question answering and sequence classification
+        if self.task in ["question-answering", "sequence-classification"]:
+            return OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "sequence"}),
+                    ("attention_mask", {0: "batch", 1: "sequence"}),
+                    ("bbox", {0: "batch", 1: "sequence"}),
+                    ("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"}),
+                ]
+            )
+        else:
+            return OrderedDict(
+                [
+                    ("input_ids", {0: "batch", 1: "sequence"}),
+                    ("bbox", {0: "batch", 1: "sequence"}),
+                    ("attention_mask", {0: "batch", 1: "sequence"}),
+                    ("pixel_values", {0: "batch", 1: "num_channels"}),
+                ]
+            )
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-5
+
+    @property
+    def default_onnx_opset(self) -> int:
+        return 12
+
+    def generate_dummy_inputs(
+        self,
+        processor: "ProcessorMixin",
+        batch_size: int = -1,
+        seq_length: int = -1,
+        is_pair: bool = False,
+        framework: Optional["TensorType"] = None,
+        num_channels: int = 3,
+        image_width: int = 40,
+        image_height: int = 40,
+    ) -> Mapping[str, Any]:
+        """
+        Generate inputs to provide to the ONNX exporter for the specific framework
+
+        Args:
+            processor ([`ProcessorMixin`]):
+                The processor associated with this model configuration.
+            batch_size (`int`, *optional*, defaults to -1):
+                The batch size to export the model for (-1 means dynamic axis).
+            seq_length (`int`, *optional*, defaults to -1):
+                The sequence length to export the model for (-1 means dynamic axis).
+            is_pair (`bool`, *optional*, defaults to `False`):
+                Indicate if the input is a pair (sentence 1, sentence 2).
+            framework (`TensorType`, *optional*, defaults to `None`):
+                The framework (PyTorch or TensorFlow) that the processor will generate tensors for.
+            num_channels (`int`, *optional*, defaults to 3):
+                The number of channels of the generated images.
+            image_width (`int`, *optional*, defaults to 40):
+                The width of the generated images.
+            image_height (`int`, *optional*, defaults to 40):
+                The height of the generated images.
+
+        Returns:
+            Mapping[str, Any]: holding the kwargs to provide to the model's forward function
+        """
+
+        # A dummy image is used so OCR should not be applied
+        setattr(processor.image_processor, "apply_ocr", False)
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        batch_size = compute_effective_axis_dimension(
+            batch_size, fixed_dimension=OnnxConfig.default_fixed_batch, num_token_to_add=0
+        )
+        # If dynamic axis (-1) we forward with a fixed dimension of 8 tokens to avoid optimizations made by ONNX
+        token_to_add = processor.tokenizer.num_special_tokens_to_add(is_pair)
+        seq_length = compute_effective_axis_dimension(
+            seq_length, fixed_dimension=OnnxConfig.default_fixed_sequence, num_token_to_add=token_to_add
+        )
+        # Generate dummy inputs according to compute batch and sequence
+        dummy_text = [[" ".join([processor.tokenizer.unk_token]) * seq_length]] * batch_size
+
+        # Generate dummy bounding boxes
+        dummy_bboxes = [[[48, 84, 73, 128]]] * batch_size
+
+        # If dynamic axis (-1) we forward with a fixed dimension of 2 samples to avoid optimizations made by ONNX
+        # batch_size = compute_effective_axis_dimension(batch_size, fixed_dimension=OnnxConfig.default_fixed_batch)
+        dummy_image = self._generate_dummy_images(batch_size, num_channels, image_height, image_width)
+
+        inputs = dict(
+            processor(
+                dummy_image,
+                text=dummy_text,
+                boxes=dummy_bboxes,
+                return_tensors=framework,
+            )
+        )
+
+        return inputs
+
+
+__all__ = ["LayoutLMv3Config", "LayoutLMv3OnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..8189abf673119cccbf87387f65e15f8dd43bde80
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/image_processing_layoutlmv3.py
@@ -0,0 +1,380 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for LayoutLMv3."""
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import resize, to_channel_dimension_format, to_pil_image
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_preprocess_arguments,
+)
+from ...utils import (
+    TensorType,
+    filter_out_non_signature_kwargs,
+    is_pytesseract_available,
+    is_vision_available,
+    logging,
+    requires_backends,
+)
+from ...utils.import_utils import requires
+
+
+if is_vision_available():
+    import PIL
+
+# soft dependency
+if is_pytesseract_available():
+    import pytesseract
+
+logger = logging.get_logger(__name__)
+
+
+def normalize_box(box, width, height):
+    return [
+        int(1000 * (box[0] / width)),
+        int(1000 * (box[1] / height)),
+        int(1000 * (box[2] / width)),
+        int(1000 * (box[3] / height)),
+    ]
+
+
+def apply_tesseract(
+    image: np.ndarray,
+    lang: Optional[str],
+    tesseract_config: Optional[str],
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """Applies Tesseract OCR on a document image, and returns recognized words + normalized bounding boxes."""
+
+    # apply OCR
+    pil_image = to_pil_image(image, input_data_format=input_data_format)
+    image_width, image_height = pil_image.size
+    data = pytesseract.image_to_data(pil_image, lang=lang, output_type="dict", config=tesseract_config)
+    words, left, top, width, height = data["text"], data["left"], data["top"], data["width"], data["height"]
+
+    # filter empty words and corresponding coordinates
+    irrelevant_indices = [idx for idx, word in enumerate(words) if not word.strip()]
+    words = [word for idx, word in enumerate(words) if idx not in irrelevant_indices]
+    left = [coord for idx, coord in enumerate(left) if idx not in irrelevant_indices]
+    top = [coord for idx, coord in enumerate(top) if idx not in irrelevant_indices]
+    width = [coord for idx, coord in enumerate(width) if idx not in irrelevant_indices]
+    height = [coord for idx, coord in enumerate(height) if idx not in irrelevant_indices]
+
+    # turn coordinates into (left, top, left+width, top+height) format
+    actual_boxes = []
+    for x, y, w, h in zip(left, top, width, height):
+        actual_box = [x, y, x + w, y + h]
+        actual_boxes.append(actual_box)
+
+    # finally, normalize the bounding boxes
+    normalized_boxes = []
+    for box in actual_boxes:
+        normalized_boxes.append(normalize_box(box, image_width, image_height))
+
+    assert len(words) == len(normalized_boxes), "Not as many words as there are bounding boxes"
+
+    return words, normalized_boxes
+
+
+@requires(backends=("vision",))
+class LayoutLMv3ImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a LayoutLMv3 image processor.
+
+    Args:
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to `(size["height"], size["width"])`. Can be
+            overridden by `do_resize` in `preprocess`.
+        size (`dict[str, int]` *optional*, defaults to `{"height": 224, "width": 224}`):
+            Size of the image after resizing. Can be overridden by `size` in `preprocess`.
+        resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in `preprocess`.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image's pixel values by the specified `rescale_value`. Can be overridden by
+            `do_rescale` in `preprocess`.
+        rescale_factor (`float`, *optional*, defaults to 1 / 255):
+            Value by which the image's pixel values are rescaled. Can be overridden by `rescale_factor` in
+            `preprocess`.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by the `do_normalize` parameter in the `preprocess`
+            method.
+        image_mean (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_MEAN`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`Iterable[float]` or `float`, *optional*, defaults to `IMAGENET_STANDARD_STD`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+        apply_ocr (`bool`, *optional*, defaults to `True`):
+            Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes. Can be overridden by
+            the `apply_ocr` parameter in the `preprocess` method.
+        ocr_lang (`str`, *optional*):
+            The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+            used. Can be overridden by the `ocr_lang` parameter in the `preprocess` method.
+        tesseract_config (`str`, *optional*):
+            Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+            Tesseract. For example: '--psm 6'. Can be overridden by the `tesseract_config` parameter in the
+            `preprocess` method.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        do_rescale: bool = True,
+        rescale_value: float = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, Iterable[float]]] = None,
+        image_std: Optional[Union[float, Iterable[float]]] = None,
+        apply_ocr: bool = True,
+        ocr_lang: Optional[str] = None,
+        tesseract_config: Optional[str] = "",
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"height": 224, "width": 224}
+        size = get_size_dict(size)
+
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_value
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else IMAGENET_STANDARD_MEAN
+        self.image_std = image_std if image_std is not None else IMAGENET_STANDARD_STD
+        self.apply_ocr = apply_ocr
+        self.ocr_lang = ocr_lang
+        self.tesseract_config = tesseract_config
+
+    # Copied from transformers.models.vit.image_processing_vit.ViTImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BILINEAR,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image to `(size["height"], size["width"])`.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Dictionary in the format `{"height": int, "width": int}` specifying the size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BILINEAR`):
+                `PILImageResampling` filter to use when resizing the image e.g. `PILImageResampling.BILINEAR`.
+            data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the output image. If unset, the channel dimension format of the input
+                image is used. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+
+        Returns:
+            `np.ndarray`: The resized image.
+        """
+        size = get_size_dict(size)
+        if "height" not in size or "width" not in size:
+            raise ValueError(f"The `size` dictionary must contain the keys `height` and `width`. Got {size.keys()}")
+        output_size = (size["height"], size["width"])
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    @filter_out_non_signature_kwargs()
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample=None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, Iterable[float]]] = None,
+        image_std: Optional[Union[float, Iterable[float]]] = None,
+        apply_ocr: Optional[bool] = None,
+        ocr_lang: Optional[str] = None,
+        tesseract_config: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: ChannelDimension = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Desired size of the output image after applying `resize`.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the `PILImageResampling` filters.
+                Only has an effect if `do_resize` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image pixel values between [0, 1].
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to apply to the image pixel values. Only has an effect if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `Iterable[float]`, *optional*, defaults to `self.image_mean`):
+                Mean values to be used for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `Iterable[float]`, *optional*, defaults to `self.image_std`):
+                Standard deviation values to be used for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            apply_ocr (`bool`, *optional*, defaults to `self.apply_ocr`):
+                Whether to apply the Tesseract OCR engine to get words + normalized bounding boxes.
+            ocr_lang (`str`, *optional*, defaults to `self.ocr_lang`):
+                The language, specified by its ISO code, to be used by the Tesseract OCR engine. By default, English is
+                used.
+            tesseract_config (`str`, *optional*, defaults to `self.tesseract_config`):
+                Any additional custom configuration flags that are forwarded to the `config` parameter when calling
+                Tesseract.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                    - Unset: Return a list of `np.ndarray`.
+                    - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                    - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                    - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                    - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                    - `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size)
+        resample = resample if resample is not None else self.resample
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        apply_ocr = apply_ocr if apply_ocr is not None else self.apply_ocr
+        ocr_lang = ocr_lang if ocr_lang is not None else self.ocr_lang
+        tesseract_config = tesseract_config if tesseract_config is not None else self.tesseract_config
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if do_rescale and is_scaled_image(images[0]):
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        # Tesseract OCR to get words + normalized bounding boxes
+        if apply_ocr:
+            requires_backends(self, "pytesseract")
+            words_batch = []
+            boxes_batch = []
+            for image in images:
+                words, boxes = apply_tesseract(image, ocr_lang, tesseract_config, input_data_format=input_data_format)
+                words_batch.append(words)
+                boxes_batch.append(boxes)
+
+        if do_resize:
+            images = [
+                self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_rescale:
+            images = [
+                self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        if do_normalize:
+            images = [
+                self.normalize(image=image, mean=image_mean, std=image_std, input_data_format=input_data_format)
+                for image in images
+            ]
+
+        images = [
+            to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format) for image in images
+        ]
+
+        data = BatchFeature(data={"pixel_values": images}, tensor_type=return_tensors)
+
+        if apply_ocr:
+            data["words"] = words_batch
+            data["boxes"] = boxes_batch
+        return data
+
+
+__all__ = ["LayoutLMv3ImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f1b6cead6077b391a993e346b6ab572f73e5380
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutlmv3/processing_layoutlmv3.py
@@ -0,0 +1,184 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for LayoutLMv3.
+"""
+
+import warnings
+from typing import Optional, Union
+
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
+
+
+class LayoutLMv3Processor(ProcessorMixin):
+    r"""
+    Constructs a LayoutLMv3 processor which combines a LayoutLMv3 image processor and a LayoutLMv3 tokenizer into a
+    single processor.
+
+    [`LayoutLMv3Processor`] offers all the functionalities you need to prepare data for the model.
+
+    It first uses [`LayoutLMv3ImageProcessor`] to resize and normalize document images, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutLMv3Tokenizer`] or
+    [`LayoutLMv3TokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
+    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
+    into token-level `labels` for token classification tasks (such as FUNSD, CORD).
+
+    Args:
+        image_processor (`LayoutLMv3ImageProcessor`, *optional*):
+            An instance of [`LayoutLMv3ImageProcessor`]. The image processor is a required input.
+        tokenizer (`LayoutLMv3Tokenizer` or `LayoutLMv3TokenizerFast`, *optional*):
+            An instance of [`LayoutLMv3Tokenizer`] or [`LayoutLMv3TokenizerFast`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv3ImageProcessor"
+    tokenizer_class = ("LayoutLMv3Tokenizer", "LayoutLMv3TokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        This method first forwards the `images` argument to [`~LayoutLMv3ImageProcessor.__call__`]. In case
+        [`LayoutLMv3ImageProcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        bounding boxes along with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output,
+        together with resized and normalized `pixel_values`. In case [`LayoutLMv3ImageProcessor`] was initialized with
+        `apply_ocr` set to `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along
+        with the additional arguments to [`~LayoutLMv3Tokenizer.__call__`] and returns the output, together with
+        resized and normalized `pixel_values`.
+
+        Please refer to the docstring of the above two methods for more information.
+        """
+        # verify input
+        if self.image_processor.apply_ocr and (boxes is not None):
+            raise ValueError(
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if self.image_processor.apply_ocr and (word_labels is not None):
+            raise ValueError(
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
+            )
+
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)
+
+        # second, apply the tokenizer
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
+            if isinstance(text, str):
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
+            text_pair = features["words"]
+
+        encoded_inputs = self.tokenizer(
+            text=text if text is not None else features["words"],
+            text_pair=text_pair if text_pair is not None else None,
+            boxes=boxes if boxes is not None else features["boxes"],
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        # add pixel values
+        images = features.pop("pixel_values")
+        if return_overflowing_tokens is True:
+            images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
+        encoded_inputs["pixel_values"] = images
+
+        return encoded_inputs
+
+    def get_overflowing_images(self, images, overflow_to_sample_mapping):
+        # in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
+        images_with_overflow = []
+        for sample_idx in overflow_to_sample_mapping:
+            images_with_overflow.append(images[sample_idx])
+
+        if len(images_with_overflow) != len(overflow_to_sample_mapping):
+            raise ValueError(
+                "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
+                f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
+            )
+
+        return images_with_overflow
+
+    @property
+    def model_input_names(self):
+        return ["input_ids", "bbox", "attention_mask", "pixel_values"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+__all__ = ["LayoutLMv3Processor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6f5a662e6afd2c83768e9731d896da83b0b89193
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .processing_layoutxlm import *
+    from .tokenization_layoutxlm import *
+    from .tokenization_layoutxlm_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..e3ece89f434ba1ab69a1ec2ded35722559c32611
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/processing_layoutxlm.py
@@ -0,0 +1,185 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for LayoutXLM.
+"""
+
+import warnings
+from typing import Optional, Union
+
+from ...processing_utils import ProcessorMixin
+from ...tokenization_utils_base import BatchEncoding, PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
+from ...utils import TensorType
+
+
+class LayoutXLMProcessor(ProcessorMixin):
+    r"""
+    Constructs a LayoutXLM processor which combines a LayoutXLM image processor and a LayoutXLM tokenizer into a single
+    processor.
+
+    [`LayoutXLMProcessor`] offers all the functionalities you need to prepare data for the model.
+
+    It first uses [`LayoutLMv2ImageProcessor`] to resize document images to a fixed size, and optionally applies OCR to
+    get words and normalized bounding boxes. These are then provided to [`LayoutXLMTokenizer`] or
+    [`LayoutXLMTokenizerFast`], which turns the words and bounding boxes into token-level `input_ids`,
+    `attention_mask`, `token_type_ids`, `bbox`. Optionally, one can provide integer `word_labels`, which are turned
+    into token-level `labels` for token classification tasks (such as FUNSD, CORD).
+
+    Args:
+        image_processor (`LayoutLMv2ImageProcessor`, *optional*):
+            An instance of [`LayoutLMv2ImageProcessor`]. The image processor is a required input.
+        tokenizer (`LayoutXLMTokenizer` or `LayoutXLMTokenizerFast`, *optional*):
+            An instance of [`LayoutXLMTokenizer`] or [`LayoutXLMTokenizerFast`]. The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "LayoutLMv2ImageProcessor"
+    tokenizer_class = ("LayoutXLMTokenizer", "LayoutXLMTokenizerFast")
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(
+        self,
+        images,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        This method first forwards the `images` argument to [`~LayoutLMv2ImagePrpcessor.__call__`]. In case
+        [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to `True`, it passes the obtained words and
+        bounding boxes along with the additional arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output,
+        together with resized `images`. In case [`LayoutLMv2ImagePrpcessor`] was initialized with `apply_ocr` set to
+        `False`, it passes the words (`text`/``text_pair`) and `boxes` specified by the user along with the additional
+        arguments to [`~LayoutXLMTokenizer.__call__`] and returns the output, together with resized `images``.
+
+        Please refer to the docstring of the above two methods for more information.
+        """
+        # verify input
+        if self.image_processor.apply_ocr and (boxes is not None):
+            raise ValueError(
+                "You cannot provide bounding boxes if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if self.image_processor.apply_ocr and (word_labels is not None):
+            raise ValueError(
+                "You cannot provide word labels if you initialized the image processor with apply_ocr set to True."
+            )
+
+        if return_overflowing_tokens is True and return_offsets_mapping is False:
+            raise ValueError("You cannot return overflowing tokens without returning the offsets mapping.")
+
+        # first, apply the image processor
+        features = self.image_processor(images=images, return_tensors=return_tensors)
+
+        # second, apply the tokenizer
+        if text is not None and self.image_processor.apply_ocr and text_pair is None:
+            if isinstance(text, str):
+                text = [text]  # add batch dimension (as the image processor always adds a batch dimension)
+            text_pair = features["words"]
+
+        encoded_inputs = self.tokenizer(
+            text=text if text is not None else features["words"],
+            text_pair=text_pair if text_pair is not None else None,
+            boxes=boxes if boxes is not None else features["boxes"],
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            return_tensors=return_tensors,
+            **kwargs,
+        )
+
+        # add pixel values
+        images = features.pop("pixel_values")
+        if return_overflowing_tokens is True:
+            images = self.get_overflowing_images(images, encoded_inputs["overflow_to_sample_mapping"])
+        encoded_inputs["image"] = images
+
+        return encoded_inputs
+
+    def get_overflowing_images(self, images, overflow_to_sample_mapping):
+        # in case there's an overflow, ensure each `input_ids` sample is mapped to its corresponding image
+        images_with_overflow = []
+        for sample_idx in overflow_to_sample_mapping:
+            images_with_overflow.append(images[sample_idx])
+
+        if len(images_with_overflow) != len(overflow_to_sample_mapping):
+            raise ValueError(
+                "Expected length of images to be the same as the length of `overflow_to_sample_mapping`, but got"
+                f" {len(images_with_overflow)} and {len(overflow_to_sample_mapping)}"
+            )
+
+        return images_with_overflow
+
+    @property
+    def model_input_names(self):
+        return ["input_ids", "bbox", "attention_mask", "image"]
+
+    @property
+    def feature_extractor_class(self):
+        warnings.warn(
+            "`feature_extractor_class` is deprecated and will be removed in v5. Use `image_processor_class` instead.",
+            FutureWarning,
+        )
+        return self.image_processor_class
+
+    @property
+    def feature_extractor(self):
+        warnings.warn(
+            "`feature_extractor` is deprecated and will be removed in v5. Use `image_processor` instead.",
+            FutureWarning,
+        )
+        return self.image_processor
+
+
+__all__ = ["LayoutXLMProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..3dbe3c21a6365b5a20988626cc57520ea37d5fc3
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm.py
@@ -0,0 +1,1191 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for LayoutXLM model."""
+
+import os
+from shutil import copyfile
+from typing import Any, Optional, Union
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PreTokenizedInput,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, logging
+from ...utils.import_utils import requires
+from ..xlm_roberta.tokenization_xlm_roberta import (
+    SPIECE_UNDERLINE,
+    VOCAB_FILES_NAMES,
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            return_token_type_ids (`bool`, *optional*):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+                of returning overflowing tokens.
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+                Whether or not to return special tokens mask information.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
+
+                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+                Python's tokenizer, this method will raise `NotImplementedError`.
+            return_length  (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the lengths of the encoded inputs.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+            **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+
+              [What are input IDs?](../glossary#input-ids)
+
+            - **bbox** -- List of bounding boxes to be fed to a model.
+
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
+              if *"token_type_ids"* is in `self.model_input_names`).
+
+              [What are token type IDs?](../glossary#token-type-ids)
+
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+
+              [What are attention masks?](../glossary#attention-mask)
+
+            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`).
+"""
+
+
+@requires(backends=("sentencepiece",))
+class LayoutXLMTokenizer(PreTrainedTokenizer):
+    """
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+
+    Attributes:
+        sp_model (`SentencePieceProcessor`):
+            The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[1000, 1000, 1000, 1000],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, special=True) if isinstance(mask_token, str) else mask_token
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(str(vocab_file))
+        self.vocab_file = vocab_file
+
+        # Original fairseq vocab and spm vocab must be "aligned":
+        # Vocab    |    0    |    1    |   2    |    3    |  4  |  5  |  6  |   7   |   8   |  9
+        # -------- | ------- | ------- | ------ | ------- | --- | --- | --- | ----- | ----- | ----
+        # fairseq  | '<s>'   | '<pad>' | '</s>' | '<unk>' | ',' | '.' | '▁' | 's'   | '▁de' | '-'
+        # spm      | '<unk>' | '<s>'   | '</s>' | ','     | '.' | '▁' | 's' | '▁de' | '-'   | '▁a'
+
+        # Mimic fairseq token-to-id alignment for the first 4 token
+        self.fairseq_tokens_to_ids = {"<s>": 0, "<pad>": 1, "</s>": 2, "<unk>": 3}
+
+        # The first "real" token "," has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+
+        self.fairseq_tokens_to_ids["<mask>"] = len(self.sp_model) + self.fairseq_offset
+        self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}
+
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__.update(d)
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        if token_ids_1 is None:
+            return [1] + ([0] * len(token_ids_0)) + [1]
+        return [1] + ([0] * len(token_ids_0)) + [1, 1] + ([0] * len(token_ids_1)) + [1]
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model) + self.fairseq_offset + 1  # Add the <mask> token
+
+    def get_vocab(self):
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text: str) -> list[str]:
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        if token in self.fairseq_tokens_to_ids:
+            return self.fairseq_tokens_to_ids[token]
+        spm_id = self.sp_model.PieceToId(token)
+
+        # Need to return unknown token if the SP model returned 0
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        if index in self.fairseq_ids_to_tokens:
+            return self.fairseq_ids_to_tokens[index]
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`list[list[int]]`, `list[list[list[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`list[int]`, `list[list[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # List are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            is_batched = isinstance(text, (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        words = text if text_pair is None else text_pair
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast."
+            )
+
+        batch_outputs = self._batch_prepare_for_model(
+            batch_text_or_text_pairs=batch_text_or_text_pairs,
+            is_pair=is_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            return_tensors=return_tensors,
+            verbose=verbose,
+        )
+
+        return BatchEncoding(batch_outputs)
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def _batch_prepare_for_model(
+        self,
+        batch_text_or_text_pairs,
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence of input id, or a pair of sequences of inputs ids so that it can be used by the model. It
+        adds special tokens, truncates sequences if overflowing while taking into account the special tokens and
+        manages a moving window (with user defined stride) for overflowing tokens
+
+        Args:
+            batch_ids_pairs: list of tokenized input ids or input ids pairs
+        """
+
+        batch_outputs = {}
+        for idx, example in enumerate(zip(batch_text_or_text_pairs, boxes)):
+            batch_text_or_text_pair, boxes_example = example
+            outputs = self.prepare_for_model(
+                batch_text_or_text_pair[0] if is_pair else batch_text_or_text_pair,
+                batch_text_or_text_pair[1] if is_pair else None,
+                boxes_example,
+                word_labels=word_labels[idx] if word_labels is not None else None,
+                add_special_tokens=add_special_tokens,
+                padding=PaddingStrategy.DO_NOT_PAD.value,  # we pad in batch afterward
+                truncation=truncation_strategy.value,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=None,  # we pad in batch afterward
+                padding_side=None,  # we pad in batch afterward
+                return_attention_mask=False,  # we pad in batch afterward
+                return_token_type_ids=return_token_type_ids,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_length=return_length,
+                return_tensors=None,  # We convert the whole batch to tensors at the end
+                prepend_batch_axis=False,
+                verbose=verbose,
+            )
+
+            for key, value in outputs.items():
+                if key not in batch_outputs:
+                    batch_outputs[key] = []
+                batch_outputs[key].append(value)
+
+        batch_outputs = self.pad(
+            batch_outputs,
+            padding=padding_strategy.value,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_attention_mask=return_attention_mask,
+        )
+
+        batch_outputs = BatchEncoding(batch_outputs, tensor_type=return_tensors)
+
+        return batch_outputs
+
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if return_offsets_mapping:
+            raise NotImplementedError(
+                "return_offset_mapping is not available when using Python tokenizers. "
+                "To use this feature, change your tokenizer to one deriving from "
+                "transformers.PreTrainedTokenizerFast. "
+                "More information on available tokenizers at "
+                "https://github.com/huggingface/transformers/pull/2674"
+            )
+
+        return self.prepare_for_model(
+            text=text,
+            text_pair=text_pair,
+            boxes=boxes,
+            word_labels=word_labels,
+            add_special_tokens=add_special_tokens,
+            padding=padding_strategy.value,
+            truncation=truncation_strategy.value,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            prepend_batch_axis=True,
+            return_attention_mask=return_attention_mask,
+            return_token_type_ids=return_token_type_ids,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_length=return_length,
+            verbose=verbose,
+        )
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def prepare_for_model(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        prepend_batch_axis: bool = False,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Prepares a sequence or a pair of sequences so that it can be used by the model. It adds special tokens,
+        truncates sequences if overflowing while taking into account the special tokens and manages a moving window
+        (with user defined stride) for overflowing tokens.
+
+        Word-level `boxes` are turned into token-level `bbox`. If provided, word-level `word_labels` are turned into
+        token-level `labels`. The word label is used for the first token of the word, while remaining tokens are
+        labeled with -100, such that they will be ignored by the loss function.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The first sequence to be encoded. This can be a string, a list of strings or a list of list of strings.
+            text_pair (`list[str]` or `list[int]`, *optional*):
+                Optional second sequence to be encoded. This can be a list of strings (words of a single example) or a
+                list of list of strings (words of a batch of examples).
+        """
+
+        # Backward compatibility for 'truncation_strategy', 'pad_to_max_length'
+        padding_strategy, truncation_strategy, max_length, kwargs = self._get_padding_truncation_strategies(
+            padding=padding,
+            truncation=truncation,
+            max_length=max_length,
+            pad_to_multiple_of=pad_to_multiple_of,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        tokens = []
+        pair_tokens = []
+        token_boxes = []
+        pair_token_boxes = []
+        labels = []
+
+        if text_pair is None:
+            if word_labels is None:
+                # CASE 1: document image classification (training + inference) + CASE 2: token classification (inference)
+                for word, box in zip(text, boxes):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))
+            else:
+                # CASE 2: token classification (training)
+                for word, box, label in zip(text, boxes, word_labels):
+                    if len(word) < 1:  # skip empty words
+                        continue
+                    word_tokens = self.tokenize(word)
+                    tokens.extend(word_tokens)
+                    token_boxes.extend([box] * len(word_tokens))
+                    if self.only_label_first_subword:
+                        # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                        labels.extend([label] + [self.pad_token_label] * (len(word_tokens) - 1))
+                    else:
+                        labels.extend([label] * len(word_tokens))
+        else:
+            # CASE 3: document visual question answering (inference)
+            # text = question
+            # text_pair = words
+            tokens = self.tokenize(text)
+            token_boxes = [self.pad_token_box for _ in range(len(tokens))] + [self.sep_token_box]
+
+            for word, box in zip(text_pair, boxes):
+                if len(word) < 1:  # skip empty words
+                    continue
+                word_tokens = self.tokenize(word)
+                pair_tokens.extend(word_tokens)
+                pair_token_boxes.extend([box] * len(word_tokens))
+
+        # Create ids + pair_ids
+        ids = self.convert_tokens_to_ids(tokens)
+        pair_ids = self.convert_tokens_to_ids(pair_tokens) if pair_tokens else None
+
+        # Compute the total size of the returned encodings
+        pair = bool(pair_ids is not None)
+        len_ids = len(ids)
+        len_pair_ids = len(pair_ids) if pair else 0
+        total_len = len_ids + len_pair_ids + (self.num_special_tokens_to_add(pair=pair) if add_special_tokens else 0)
+
+        # Truncation: Handle max sequence length
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE and max_length and total_len > max_length:
+            (
+                ids,
+                token_boxes,
+                pair_ids,
+                pair_token_boxes,
+                labels,
+                overflowing_tokens,
+                overflowing_token_boxes,
+                overflowing_labels,
+            ) = self.truncate_sequences(
+                ids,
+                token_boxes,
+                pair_ids=pair_ids,
+                pair_token_boxes=pair_token_boxes,
+                labels=labels,
+                num_tokens_to_remove=total_len - max_length,
+                truncation_strategy=truncation_strategy,
+                stride=stride,
+            )
+
+        if return_token_type_ids and not add_special_tokens:
+            raise ValueError(
+                "Asking to return token_type_ids while setting add_special_tokens to False "
+                "results in an undefined behavior. Please set add_special_tokens to True or "
+                "set return_token_type_ids to None."
+            )
+
+        # Load from model defaults
+        if return_token_type_ids is None:
+            return_token_type_ids = "token_type_ids" in self.model_input_names
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        encoded_inputs = {}
+
+        if return_overflowing_tokens:
+            encoded_inputs["overflowing_tokens"] = overflowing_tokens
+            encoded_inputs["overflowing_token_boxes"] = overflowing_token_boxes
+            encoded_inputs["overflowing_labels"] = overflowing_labels
+            encoded_inputs["num_truncated_tokens"] = total_len - max_length
+
+        # Add special tokens
+        if add_special_tokens:
+            sequence = self.build_inputs_with_special_tokens(ids, pair_ids)
+            token_type_ids = self.create_token_type_ids_from_sequences(ids, pair_ids)
+            token_boxes = [self.cls_token_box] + token_boxes + [self.sep_token_box]
+            if pair_token_boxes:
+                pair_token_boxes = pair_token_boxes + [self.sep_token_box]
+            if labels:
+                labels = [self.pad_token_label] + labels + [self.pad_token_label]
+        else:
+            sequence = ids + pair_ids if pair else ids
+            token_type_ids = [0] * len(ids) + ([0] * len(pair_ids) if pair else [])
+
+        # Build output dictionary
+        encoded_inputs["input_ids"] = sequence
+        encoded_inputs["bbox"] = token_boxes + pair_token_boxes
+        if return_token_type_ids:
+            encoded_inputs["token_type_ids"] = token_type_ids
+        if return_special_tokens_mask:
+            if add_special_tokens:
+                encoded_inputs["special_tokens_mask"] = self.get_special_tokens_mask(ids, pair_ids)
+            else:
+                encoded_inputs["special_tokens_mask"] = [0] * len(sequence)
+
+        if labels:
+            encoded_inputs["labels"] = labels
+
+        # Check lengths
+        self._eventual_warn_about_too_long_sequence(encoded_inputs["input_ids"], max_length, verbose)
+
+        # Padding
+        if padding_strategy != PaddingStrategy.DO_NOT_PAD or return_attention_mask:
+            encoded_inputs = self.pad(
+                encoded_inputs,
+                max_length=max_length,
+                padding=padding_strategy.value,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_attention_mask=return_attention_mask,
+            )
+
+        if return_length:
+            encoded_inputs["length"] = len(encoded_inputs["input_ids"])
+
+        batch_outputs = BatchEncoding(
+            encoded_inputs, tensor_type=return_tensors, prepend_batch_axis=prepend_batch_axis
+        )
+
+        return batch_outputs
+
+    def truncate_sequences(
+        self,
+        ids: list[int],
+        token_boxes: list[list[int]],
+        pair_ids: Optional[list[int]] = None,
+        pair_token_boxes: Optional[list[list[int]]] = None,
+        labels: Optional[list[int]] = None,
+        num_tokens_to_remove: int = 0,
+        truncation_strategy: Union[str, TruncationStrategy] = "longest_first",
+        stride: int = 0,
+    ) -> tuple[list[int], list[int], list[int]]:
+        """
+        Truncates a sequence pair in-place following the strategy.
+
+        Args:
+            ids (`list[int]`):
+                Tokenized input ids of the first sequence. Can be obtained from a string by chaining the `tokenize` and
+                `convert_tokens_to_ids` methods.
+            token_boxes (`list[list[int]]`):
+                Bounding boxes of the first sequence.
+            pair_ids (`list[int]`, *optional*):
+                Tokenized input ids of the second sequence. Can be obtained from a string by chaining the `tokenize`
+                and `convert_tokens_to_ids` methods.
+            pair_token_boxes (`list[list[int]]`, *optional*):
+                Bounding boxes of the second sequence.
+            labels (`list[int]`, *optional*):
+                Labels of the first sequence (for token classification tasks).
+            num_tokens_to_remove (`int`, *optional*, defaults to 0):
+                Number of tokens to remove using the truncation strategy.
+            truncation_strategy (`str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                The strategy to follow for truncation. Can be:
+
+                - `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will truncate
+                  token by token, removing a token from the longest sequence in the pair if a pair of sequences (or a
+                  batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths greater
+                  than the model maximum admissible input size).
+            stride (`int`, *optional*, defaults to 0):
+                If set to a positive number, the overflowing tokens returned will contain some tokens from the main
+                sequence returned. The value of this argument defines the number of additional tokens.
+
+        Returns:
+            `tuple[list[int], list[int], list[int]]`: The truncated `ids`, the truncated `pair_ids` and the list of
+            overflowing tokens.
+        """
+        if num_tokens_to_remove <= 0:
+            return ids, token_boxes, pair_ids, pair_token_boxes, labels, [], [], []
+
+        if not isinstance(truncation_strategy, TruncationStrategy):
+            truncation_strategy = TruncationStrategy(truncation_strategy)
+
+        overflowing_tokens = []
+        overflowing_token_boxes = []
+        overflowing_labels = []
+        if truncation_strategy == TruncationStrategy.LONGEST_FIRST:
+            for _ in range(num_tokens_to_remove):
+                if pair_ids is None or len(ids) > len(pair_ids):
+                    if not overflowing_tokens:
+                        window_len = min(len(ids), stride + 1)
+                    else:
+                        window_len = 1
+                    overflowing_tokens.extend(ids[-window_len:])
+                    overflowing_token_boxes.extend(token_boxes[-window_len:])
+                    overflowing_labels.extend(labels[-window_len:])
+                    ids = ids[:-1]
+                    token_boxes = token_boxes[:-1]
+                    labels = labels[:-1]
+                else:
+                    if not overflowing_tokens:
+                        window_len = min(len(pair_ids), stride + 1)
+                    else:
+                        window_len = 1
+                    overflowing_tokens.extend(pair_ids[-window_len:])
+                    overflowing_token_boxes.extend(pair_token_boxes[-window_len:])
+                    pair_ids = pair_ids[:-1]
+                    pair_token_boxes = pair_token_boxes[:-1]
+        elif truncation_strategy == TruncationStrategy.ONLY_FIRST:
+            if len(ids) > num_tokens_to_remove:
+                window_len = min(len(ids), stride + num_tokens_to_remove)
+                overflowing_tokens = ids[-window_len:]
+                overflowing_token_boxes = token_boxes[-window_len:]
+                overflowing_labels = labels[-window_len:]
+                ids = ids[:-num_tokens_to_remove]
+                token_boxes = token_boxes[:-num_tokens_to_remove]
+                labels = labels[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the first sequence has a length {len(ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    "for instance 'longest_first' or 'only_second'."
+                )
+        elif truncation_strategy == TruncationStrategy.ONLY_SECOND and pair_ids is not None:
+            if len(pair_ids) > num_tokens_to_remove:
+                window_len = min(len(pair_ids), stride + num_tokens_to_remove)
+                overflowing_tokens = pair_ids[-window_len:]
+                overflowing_token_boxes = pair_token_boxes[-window_len:]
+                pair_ids = pair_ids[:-num_tokens_to_remove]
+                pair_token_boxes = pair_token_boxes[:-num_tokens_to_remove]
+            else:
+                logger.error(
+                    f"We need to remove {num_tokens_to_remove} to truncate the input "
+                    f"but the second sequence has a length {len(pair_ids)}. "
+                    f"Please select another truncation strategy than {truncation_strategy}, "
+                    "for instance 'longest_first' or 'only_first'."
+                )
+
+        return (
+            ids,
+            token_boxes,
+            pair_ids,
+            pair_token_boxes,
+            labels,
+            overflowing_tokens,
+            overflowing_token_boxes,
+            overflowing_labels,
+        )
+
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            elif padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
+
+        return encoded_inputs
+
+
+__all__ = ["LayoutXLMTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..6710c6c8cb66ed08da2df391c28ad1db2e6cf81d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/layoutxlm/tokenization_layoutxlm_fast.py
@@ -0,0 +1,815 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License
+"""Tokenization classes for LayoutXLM model."""
+
+import os
+from shutil import copyfile
+from typing import Optional, Union
+
+from ...tokenization_utils import AddedToken
+from ...tokenization_utils_base import (
+    BatchEncoding,
+    EncodedInput,
+    PreTokenizedInput,
+    TextInput,
+    TextInputPair,
+    TruncationStrategy,
+)
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import PaddingStrategy, TensorType, add_end_docstrings, is_sentencepiece_available, logging
+from ..xlm_roberta.tokenization_xlm_roberta_fast import (
+    VOCAB_FILES_NAMES,
+)
+
+
+if is_sentencepiece_available():
+    from .tokenization_layoutxlm import LayoutXLMTokenizer
+else:
+    LayoutXLMTokenizer = None
+
+
+logger = logging.get_logger(__name__)
+
+LAYOUTXLM_ENCODE_KWARGS_DOCSTRING = r"""
+            add_special_tokens (`bool`, *optional*, defaults to `True`):
+                Whether or not to encode the sequences with the special tokens relative to their model.
+            padding (`bool`, `str` or [`~file_utils.PaddingStrategy`], *optional*, defaults to `False`):
+                Activates and controls padding. Accepts the following values:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            truncation (`bool`, `str` or [`~tokenization_utils_base.TruncationStrategy`], *optional*, defaults to `False`):
+                Activates and controls truncation. Accepts the following values:
+
+                - `True` or `'longest_first'`: Truncate to a maximum length specified with the argument `max_length` or
+                  to the maximum acceptable input length for the model if that argument is not provided. This will
+                  truncate token by token, removing a token from the longest sequence in the pair if a pair of
+                  sequences (or a batch of pairs) is provided.
+                - `'only_first'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the first sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `'only_second'`: Truncate to a maximum length specified with the argument `max_length` or to the
+                  maximum acceptable input length for the model if that argument is not provided. This will only
+                  truncate the second sequence of a pair if a pair of sequences (or a batch of pairs) is provided.
+                - `False` or `'do_not_truncate'` (default): No truncation (i.e., can output batch with sequence lengths
+                  greater than the model maximum admissible input size).
+            max_length (`int`, *optional*):
+                Controls the maximum length to use by one of the truncation/padding parameters.
+
+                If left unset or set to `None`, this will use the predefined model maximum length if a maximum length
+                is required by one of the truncation/padding parameters. If the model has no specific maximum input
+                length (like XLNet) truncation/padding to a maximum length will be deactivated.
+            stride (`int`, *optional*, defaults to 0):
+                If set to a number along with `max_length`, the overflowing tokens returned when
+                `return_overflowing_tokens=True` will contain some tokens from the end of the truncated sequence
+                returned to provide some overlap between truncated and overflowing sequences. The value of this
+                argument defines the number of overlapping tokens.
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value. This is especially useful to enable
+                the use of Tensor Cores on NVIDIA hardware with compute capability `>= 7.5` (Volta).
+            return_tensors (`str` or [`~file_utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            return_token_type_ids (`bool`, *optional*):
+                Whether to return token type IDs. If left to the default, will return the token type IDs according to
+                the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are token type IDs?](../glossary#token-type-ids)
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific tokenizer's default, defined by the `return_outputs` attribute.
+
+                [What are attention masks?](../glossary#attention-mask)
+            return_overflowing_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not to return overflowing token sequences. If a pair of sequences of input ids (or a batch
+                of pairs) is provided with `truncation_strategy = longest_first` or `True`, an error is raised instead
+                of returning overflowing tokens.
+            return_special_tokens_mask (`bool`, *optional*, defaults to `False`):
+                Whether or not to return special tokens mask information.
+            return_offsets_mapping (`bool`, *optional*, defaults to `False`):
+                Whether or not to return `(char_start, char_end)` for each token.
+
+                This is only available on fast tokenizers inheriting from [`PreTrainedTokenizerFast`], if using
+                Python's tokenizer, this method will raise `NotImplementedError`.
+            return_length  (`bool`, *optional*, defaults to `False`):
+                Whether or not to return the lengths of the encoded inputs.
+            verbose (`bool`, *optional*, defaults to `True`):
+                Whether or not to print more information and warnings.
+            **kwargs: passed to the `self.tokenize()` method
+
+        Return:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model.
+
+              [What are input IDs?](../glossary#input-ids)
+
+            - **bbox** -- List of bounding boxes to be fed to a model.
+
+            - **token_type_ids** -- List of token type ids to be fed to a model (when `return_token_type_ids=True` or
+              if *"token_type_ids"* is in `self.model_input_names`).
+
+              [What are token type IDs?](../glossary#token-type-ids)
+
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names`).
+
+              [What are attention masks?](../glossary#attention-mask)
+
+            - **labels** -- List of labels to be fed to a model. (when `word_labels` is specified).
+            - **overflowing_tokens** -- List of overflowing tokens sequences (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **num_truncated_tokens** -- Number of tokens truncated (when a `max_length` is specified and
+              `return_overflowing_tokens=True`).
+            - **special_tokens_mask** -- List of 0s and 1s, with 1 specifying added special tokens and 0 specifying
+              regular sequence tokens (when `add_special_tokens=True` and `return_special_tokens_mask=True`).
+            - **length** -- The length of the inputs (when `return_length=True`).
+"""
+
+
+class LayoutXLMTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" LayoutXLM tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
+    [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        mask_token (`str`, *optional*, defaults to `"<mask>"`):
+            The token used for masking values. This is the token used when training this model with masked language
+            modeling. This is the token which the model will try to predict.
+        cls_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [CLS] token.
+        sep_token_box (`list[int]`, *optional*, defaults to `[1000, 1000, 1000, 1000]`):
+            The bounding box to use for the special [SEP] token.
+        pad_token_box (`list[int]`, *optional*, defaults to `[0, 0, 0, 0]`):
+            The bounding box to use for the special [PAD] token.
+        pad_token_label (`int`, *optional*, defaults to -100):
+            The label to use for padding tokens. Defaults to -100, which is the `ignore_index` of PyTorch's
+            CrossEntropyLoss.
+        only_label_first_subword (`bool`, *optional*, defaults to `True`):
+            Whether or not to only label the first subword, in case word labels are provided.
+        additional_special_tokens (`list[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
+            Additional special tokens used by the tokenizer.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+    slow_tokenizer_class = LayoutXLMTokenizer
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        mask_token="<mask>",
+        cls_token_box=[0, 0, 0, 0],
+        sep_token_box=[1000, 1000, 1000, 1000],
+        pad_token_box=[0, 0, 0, 0],
+        pad_token_label=-100,
+        only_label_first_subword=True,
+        **kwargs,
+    ):
+        # Mask token behave like a normal word, i.e. include the space before it
+        mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token
+
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            mask_token=mask_token,
+            cls_token_box=cls_token_box,
+            sep_token_box=sep_token_box,
+            pad_token_box=pad_token_box,
+            pad_token_label=pad_token_label,
+            only_label_first_subword=only_label_first_subword,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+
+        # additional properties
+        self.cls_token_box = cls_token_box
+        self.sep_token_box = sep_token_box
+        self.pad_token_box = pad_token_box
+        self.pad_token_label = pad_token_label
+        self.only_label_first_subword = only_label_first_subword
+
+    @add_end_docstrings(LAYOUTXLM_ENCODE_KWARGS_DOCSTRING)
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]],
+        text_pair: Optional[Union[PreTokenizedInput, list[PreTokenizedInput]]] = None,
+        boxes: Optional[Union[list[list[int]], list[list[list[int]]]]] = None,
+        word_labels: Optional[Union[list[int], list[list[int]]]] = None,
+        add_special_tokens: bool = True,
+        padding: Union[bool, str, PaddingStrategy] = False,
+        truncation: Union[bool, str, TruncationStrategy] = None,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        """
+        Main method to tokenize and prepare for the model one or several sequence(s) or one or several pair(s) of
+        sequences with word-level normalized bounding boxes and optional labels.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string, a list of strings
+                (words of a single example or questions of a batch of examples) or a list of list of strings (batch of
+                words).
+            text_pair (`list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence should be a list of strings
+                (pretokenized string).
+            boxes (`list[list[int]]`, `list[list[list[int]]]`):
+                Word-level bounding boxes. Each bounding box should be normalized to be on a 0-1000 scale.
+            word_labels (`list[int]`, `list[list[int]]`, *optional*):
+                Word-level integer labels (for token classification tasks such as FUNSD, CORD).
+        """
+
+        # Input type checking for clearer error
+        def _is_valid_text_input(t):
+            if isinstance(t, str):
+                # Strings are fine
+                return True
+            elif isinstance(t, (list, tuple)):
+                # List are fine as long as they are...
+                if len(t) == 0:
+                    # ... empty
+                    return True
+                elif isinstance(t[0], str):
+                    # ... list of strings
+                    return True
+                elif isinstance(t[0], (list, tuple)):
+                    # ... list with an empty list or with a list of strings
+                    return len(t[0]) == 0 or isinstance(t[0][0], str)
+                else:
+                    return False
+            else:
+                return False
+
+        if text_pair is not None:
+            # in case text + text_pair are provided, text = questions, text_pair = words
+            if not _is_valid_text_input(text):
+                raise ValueError("text input must of type `str` (single example) or `list[str]` (batch of examples). ")
+            if not isinstance(text_pair, (list, tuple)):
+                raise ValueError(
+                    "words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+        else:
+            # in case only text is provided => must be words
+            if not isinstance(text, (list, tuple)):
+                raise ValueError(
+                    "Words must of type `list[str]` (single pretokenized example), "
+                    "or `list[list[str]]` (batch of pretokenized examples)."
+                )
+
+        if text_pair is not None:
+            is_batched = isinstance(text, (list, tuple))
+        else:
+            is_batched = isinstance(text, (list, tuple)) and text and isinstance(text[0], (list, tuple))
+
+        words = text if text_pair is None else text_pair
+        if boxes is None:
+            raise ValueError("You must provide corresponding bounding boxes")
+        if is_batched:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide words and boxes for an equal amount of examples")
+            for words_example, boxes_example in zip(words, boxes):
+                if len(words_example) != len(boxes_example):
+                    raise ValueError("You must provide as many words as there are bounding boxes")
+        else:
+            if len(words) != len(boxes):
+                raise ValueError("You must provide as many words as there are bounding boxes")
+
+        if is_batched:
+            if text_pair is not None and len(text) != len(text_pair):
+                raise ValueError(
+                    f"batch length of `text`: {len(text)} does not match batch length of `text_pair`:"
+                    f" {len(text_pair)}."
+                )
+            batch_text_or_text_pairs = list(zip(text, text_pair)) if text_pair is not None else text
+            is_pair = bool(text_pair is not None)
+            return self.batch_encode_plus(
+                batch_text_or_text_pairs=batch_text_or_text_pairs,
+                is_pair=is_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+        else:
+            return self.encode_plus(
+                text=text,
+                text_pair=text_pair,
+                boxes=boxes,
+                word_labels=word_labels,
+                add_special_tokens=add_special_tokens,
+                padding=padding,
+                truncation=truncation,
+                max_length=max_length,
+                stride=stride,
+                pad_to_multiple_of=pad_to_multiple_of,
+                padding_side=padding_side,
+                return_tensors=return_tensors,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=return_offsets_mapping,
+                return_length=return_length,
+                verbose=verbose,
+                **kwargs,
+            )
+
+    def tokenize(self, text: str, pair: Optional[str] = None, add_special_tokens: bool = False, **kwargs) -> list[str]:
+        batched_input = [(text, pair)] if pair else [text]
+
+        self._tokenizer.encode_special_tokens = kwargs.pop(
+            "split_special_tokens", self._tokenizer.encode_special_tokens
+        )
+
+        encodings = self._tokenizer.encode_batch(
+            batched_input, add_special_tokens=add_special_tokens, is_pretokenized=False, **kwargs
+        )
+
+        return encodings[0].tokens
+
+    def _batch_encode_plus(
+        self,
+        batch_text_or_text_pairs: Union[
+            list[TextInput],
+            list[TextInputPair],
+            list[PreTokenizedInput],
+        ],
+        is_pair: Optional[bool] = None,
+        boxes: Optional[list[list[list[int]]]] = None,
+        word_labels: Optional[list[list[int]]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[str] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        if not isinstance(batch_text_or_text_pairs, list):
+            raise TypeError(f"batch_text_or_text_pairs has to be a list (got {type(batch_text_or_text_pairs)})")
+
+        # Set the truncation and padding strategy and restore the initial configuration
+        self.set_truncation_and_padding(
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+        )
+
+        if is_pair:
+            batch_text_or_text_pairs = [(text.split(), text_pair) for text, text_pair in batch_text_or_text_pairs]
+
+        encodings = self._tokenizer.encode_batch(
+            batch_text_or_text_pairs,
+            add_special_tokens=add_special_tokens,
+            is_pretokenized=True,  # we set this to True as LayoutLMv2 always expects pretokenized inputs
+        )
+
+        # Convert encoding to dict
+        # `Tokens` has type: tuple[
+        #                       list[dict[str, list[list[int]]]] or list[dict[str, 2D-Tensor]],
+        #                       list[EncodingFast]
+        #                    ]
+        # with nested dimensions corresponding to batch, overflows, sequence length
+        tokens_and_encodings = [
+            self._convert_encoding(
+                encoding=encoding,
+                return_token_type_ids=return_token_type_ids,
+                return_attention_mask=return_attention_mask,
+                return_overflowing_tokens=return_overflowing_tokens,
+                return_special_tokens_mask=return_special_tokens_mask,
+                return_offsets_mapping=True
+                if word_labels is not None
+                else return_offsets_mapping,  # we use offsets to create the labels
+                return_length=return_length,
+                verbose=verbose,
+            )
+            for encoding in encodings
+        ]
+
+        # Convert the output to have dict[list] from list[dict] and remove the additional overflows dimension
+        # From (variable) shape (batch, overflows, sequence length) to ~ (batch * overflows, sequence length)
+        # (we say ~ because the number of overflow varies with the example in the batch)
+        #
+        # To match each overflowing sample with the original sample in the batch
+        # we add an overflow_to_sample_mapping array (see below)
+        sanitized_tokens = {}
+        for key in tokens_and_encodings[0][0]:
+            stack = [e for item, _ in tokens_and_encodings for e in item[key]]
+            sanitized_tokens[key] = stack
+        sanitized_encodings = [e for _, item in tokens_and_encodings for e in item]
+
+        # If returning overflowing tokens, we need to return a mapping
+        # from the batch idx to the original sample
+        if return_overflowing_tokens:
+            overflow_to_sample_mapping = []
+            for i, (toks, _) in enumerate(tokens_and_encodings):
+                overflow_to_sample_mapping += [i] * len(toks["input_ids"])
+            sanitized_tokens["overflow_to_sample_mapping"] = overflow_to_sample_mapping
+
+        for input_ids in sanitized_tokens["input_ids"]:
+            self._eventual_warn_about_too_long_sequence(input_ids, max_length, verbose)
+
+        # create the token boxes
+        token_boxes = []
+        for batch_index in range(len(sanitized_tokens["input_ids"])):
+            if return_overflowing_tokens:
+                original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+            else:
+                original_index = batch_index
+            token_boxes_example = []
+            for id, sequence_id, word_id in zip(
+                sanitized_tokens["input_ids"][batch_index],
+                sanitized_encodings[batch_index].sequence_ids,
+                sanitized_encodings[batch_index].word_ids,
+            ):
+                if word_id is not None:
+                    if is_pair and sequence_id == 0:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        token_boxes_example.append(boxes[original_index][word_id])
+                else:
+                    if id == self.cls_token_id:
+                        token_boxes_example.append(self.cls_token_box)
+                    elif id == self.sep_token_id:
+                        token_boxes_example.append(self.sep_token_box)
+                    elif id == self.pad_token_id:
+                        token_boxes_example.append(self.pad_token_box)
+                    else:
+                        raise ValueError("Id not recognized")
+            token_boxes.append(token_boxes_example)
+
+        sanitized_tokens["bbox"] = token_boxes
+
+        # optionally, create the labels
+        if word_labels is not None:
+            labels = []
+            for batch_index in range(len(sanitized_tokens["input_ids"])):
+                if return_overflowing_tokens:
+                    original_index = sanitized_tokens["overflow_to_sample_mapping"][batch_index]
+                else:
+                    original_index = batch_index
+                labels_example = []
+                for id, offset, word_id in zip(
+                    sanitized_tokens["input_ids"][batch_index],
+                    sanitized_tokens["offset_mapping"][batch_index],
+                    sanitized_encodings[batch_index].word_ids,
+                ):
+                    if word_id is not None:
+                        if self.only_label_first_subword:
+                            if offset[0] == 0:
+                                # Use the real label id for the first token of the word, and padding ids for the remaining tokens
+                                labels_example.append(word_labels[original_index][word_id])
+                            else:
+                                labels_example.append(self.pad_token_label)
+                        else:
+                            labels_example.append(word_labels[original_index][word_id])
+                    else:
+                        labels_example.append(self.pad_token_label)
+                labels.append(labels_example)
+
+            sanitized_tokens["labels"] = labels
+            # finally, remove offsets if the user didn't want them
+            if not return_offsets_mapping:
+                del sanitized_tokens["offset_mapping"]
+
+        return BatchEncoding(sanitized_tokens, sanitized_encodings, tensor_type=return_tensors)
+
+    def _encode_plus(
+        self,
+        text: Union[TextInput, PreTokenizedInput],
+        text_pair: Optional[PreTokenizedInput] = None,
+        boxes: Optional[list[list[int]]] = None,
+        word_labels: Optional[list[int]] = None,
+        add_special_tokens: bool = True,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        truncation_strategy: TruncationStrategy = TruncationStrategy.DO_NOT_TRUNCATE,
+        max_length: Optional[int] = None,
+        stride: int = 0,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_tensors: Optional[bool] = None,
+        return_token_type_ids: Optional[bool] = None,
+        return_attention_mask: Optional[bool] = None,
+        return_overflowing_tokens: bool = False,
+        return_special_tokens_mask: bool = False,
+        return_offsets_mapping: bool = False,
+        return_length: bool = False,
+        verbose: bool = True,
+        **kwargs,
+    ) -> BatchEncoding:
+        # make it a batched input
+        # 2 options:
+        # 1) only text, in case text must be a list of str
+        # 2) text + text_pair, in which case text = str and text_pair a list of str
+        batched_input = [(text, text_pair)] if text_pair else [text]
+        batched_boxes = [boxes]
+        batched_word_labels = [word_labels] if word_labels is not None else None
+        batched_output = self._batch_encode_plus(
+            batched_input,
+            is_pair=bool(text_pair is not None),
+            boxes=batched_boxes,
+            word_labels=batched_word_labels,
+            add_special_tokens=add_special_tokens,
+            padding_strategy=padding_strategy,
+            truncation_strategy=truncation_strategy,
+            max_length=max_length,
+            stride=stride,
+            pad_to_multiple_of=pad_to_multiple_of,
+            padding_side=padding_side,
+            return_tensors=return_tensors,
+            return_token_type_ids=return_token_type_ids,
+            return_attention_mask=return_attention_mask,
+            return_overflowing_tokens=return_overflowing_tokens,
+            return_special_tokens_mask=return_special_tokens_mask,
+            return_offsets_mapping=return_offsets_mapping,
+            return_length=return_length,
+            verbose=verbose,
+            **kwargs,
+        )
+
+        # Return tensor is None, then we can remove the leading batch axis
+        # Overflowing tokens are returned as a batch of output so we keep them in this case
+        if return_tensors is None and not return_overflowing_tokens:
+            batched_output = BatchEncoding(
+                {
+                    key: value[0] if len(value) > 0 and isinstance(value[0], list) else value
+                    for key, value in batched_output.items()
+                },
+                batched_output.encodings,
+            )
+
+        self._eventual_warn_about_too_long_sequence(batched_output["input_ids"], max_length, verbose)
+
+        return batched_output
+
+    def _pad(
+        self,
+        encoded_inputs: Union[dict[str, EncodedInput], BatchEncoding],
+        max_length: Optional[int] = None,
+        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
+        pad_to_multiple_of: Optional[int] = None,
+        padding_side: Optional[str] = None,
+        return_attention_mask: Optional[bool] = None,
+    ) -> dict:
+        """
+        Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
+
+        Args:
+            encoded_inputs:
+                Dictionary of tokenized inputs (`list[int]`) or batch of tokenized inputs (`list[list[int]]`).
+            max_length: maximum length of the returned list and optionally padding length (see below).
+                Will truncate by taking into account the special tokens.
+            padding_strategy: PaddingStrategy to use for padding.
+
+                - PaddingStrategy.LONGEST Pad to the longest sequence in the batch
+                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
+                - PaddingStrategy.DO_NOT_PAD: Do not pad
+                The tokenizer padding sides are defined in self.padding_side:
+
+                    - 'left': pads on the left of the sequences
+                    - 'right': pads on the right of the sequences
+            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
+                This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            padding_side (`str`, *optional*):
+                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
+                Default value is picked from the class attribute of the same name.
+            return_attention_mask:
+                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
+        """
+        # Load from model defaults
+        if return_attention_mask is None:
+            return_attention_mask = "attention_mask" in self.model_input_names
+
+        required_input = encoded_inputs[self.model_input_names[0]]
+
+        if padding_strategy == PaddingStrategy.LONGEST:
+            max_length = len(required_input)
+
+        if max_length is not None and pad_to_multiple_of is not None and (max_length % pad_to_multiple_of != 0):
+            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
+
+        needs_to_be_padded = padding_strategy != PaddingStrategy.DO_NOT_PAD and len(required_input) != max_length
+
+        # Initialize attention mask if not present.
+        if return_attention_mask and "attention_mask" not in encoded_inputs:
+            encoded_inputs["attention_mask"] = [1] * len(required_input)
+
+        if needs_to_be_padded:
+            difference = max_length - len(required_input)
+            padding_side = padding_side if padding_side is not None else self.padding_side
+            if padding_side == "right":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = encoded_inputs["attention_mask"] + [0] * difference
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = (
+                        encoded_inputs["token_type_ids"] + [self.pad_token_type_id] * difference
+                    )
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = encoded_inputs["bbox"] + [self.pad_token_box] * difference
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = encoded_inputs["labels"] + [self.pad_token_label] * difference
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = encoded_inputs["special_tokens_mask"] + [1] * difference
+                encoded_inputs[self.model_input_names[0]] = required_input + [self.pad_token_id] * difference
+            elif padding_side == "left":
+                if return_attention_mask:
+                    encoded_inputs["attention_mask"] = [0] * difference + encoded_inputs["attention_mask"]
+                if "token_type_ids" in encoded_inputs:
+                    encoded_inputs["token_type_ids"] = [self.pad_token_type_id] * difference + encoded_inputs[
+                        "token_type_ids"
+                    ]
+                if "bbox" in encoded_inputs:
+                    encoded_inputs["bbox"] = [self.pad_token_box] * difference + encoded_inputs["bbox"]
+                if "labels" in encoded_inputs:
+                    encoded_inputs["labels"] = [self.pad_token_label] * difference + encoded_inputs["labels"]
+                if "special_tokens_mask" in encoded_inputs:
+                    encoded_inputs["special_tokens_mask"] = [1] * difference + encoded_inputs["special_tokens_mask"]
+                encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
+            else:
+                raise ValueError("Invalid padding strategy:" + str(padding_side))
+
+        return encoded_inputs
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An XLM-RoBERTa sequence has the following format:
+
+        - single sequence: `<s> X </s>`
+        - pair of sequences: `<s> A </s></s> B </s>`
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+
+        if token_ids_1 is None:
+            return [self.cls_token_id] + token_ids_0 + [self.sep_token_id]
+        cls = [self.cls_token_id]
+        sep = [self.sep_token_id]
+        return cls + token_ids_0 + sep + sep + token_ids_1 + sep
+
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. XLM-RoBERTa does
+        not make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["LayoutXLMTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13bef0d263b1e570728f7593eb606ec4491a2310
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fe54f28b76db1f8889c771322e20c5eb94e2e8be
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/configuration_led.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1fb863f97d75af5cc8df546f2636b74716ffcf7b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0a9dd80ff334ead19a4de4f2c946098e5756d7d6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/led/__pycache__/tokenization_led_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/image_processing_llama4_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/image_processing_llama4_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..543f9f65d03d97995b9dd1d9c8e00ce265714e47
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llama4/__pycache__/image_processing_llama4_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..aadd45dc13ed4074437cc6f224b0348de110f292
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_llava import *
+    from .image_processing_llava_fast import *
+    from .modeling_llava import *
+    from .processing_llava import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e960aeeb0023de5ae54bf6107ed10f87eb01dbbd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/configuration_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/configuration_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aa366fd24ecde516610c83eead5646da88b3c7d1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/configuration_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..406e8fae9dee06d986176d158742cd72e7b9ea3c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4606589f99a2bd97a42982e2a8251fc222213aea
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/image_processing_llava_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/modeling_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/modeling_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0e27e0d3caa375ac8c2605aab367328280fe42e8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/modeling_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/processing_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/processing_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2579fd4956d0f6f5b92399b30eb38365e4319c1a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/__pycache__/processing_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/configuration_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/configuration_llava.py
new file mode 100644
index 0000000000000000000000000000000000000000..9ae710c011986f422d965adcdb758b29e3113690
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/configuration_llava.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2023 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Llava model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class LlavaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`LlavaForConditionalGeneration`]. It is used to instantiate an
+    Llava model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Llava-9B.
+
+    e.g. [llava-hf/llava-9b](https://huggingface.co/llava-hf/llava-9b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[AutoConfig, dict]`,  *optional*, defaults to `CLIPVisionConfig`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[AutoConfig, dict]`, *optional*, defaults to `LlamaConfig`):
+            The config object or dictionary of the text backbone.
+        image_token_index (`int`, *optional*, defaults to 32000):
+            The image token index to encode the image prompt.
+        projector_hidden_act (`str`, *optional*, defaults to `"gelu"`):
+            The activation function used by the multimodal projector.
+        vision_feature_select_strategy (`str`, *optional*, defaults to `"default"`):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Can be one of `"default"` or `"full"`.
+        vision_feature_layer (`Union[int, list[int]]`, *optional*, defaults to -2):
+            The index of the layer to select the vision feature. If multiple indices are provided,
+            the vision feature of the corresponding indices will be concatenated to form the
+            vision features.
+        image_seq_length (`int`, *optional*, defaults to 576):
+            Sequence length of one image embedding.
+        multimodal_projector_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias in the multimodal projector.
+
+    Example:
+
+    ```python
+    >>> from transformers import LlavaForConditionalGeneration, LlavaConfig, CLIPVisionConfig, LlamaConfig
+
+    >>> # Initializing a CLIP-vision config
+    >>> vision_config = CLIPVisionConfig()
+
+    >>> # Initializing a Llama config
+    >>> text_config = LlamaConfig()
+
+    >>> # Initializing a Llava llava-1.5-7b style configuration
+    >>> configuration = LlavaConfig(vision_config, text_config)
+
+    >>> # Initializing a model from the llava-1.5-7b style configuration
+    >>> model = LlavaForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "llava"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_index=32000,
+        projector_hidden_act="gelu",
+        vision_feature_select_strategy="default",
+        vision_feature_layer=-2,
+        image_seq_length=576,
+        multimodal_projector_bias=True,
+        **kwargs,
+    ):
+        self.image_token_index = image_token_index
+        self.projector_hidden_act = projector_hidden_act
+        self.image_seq_length = image_seq_length
+
+        if vision_feature_select_strategy not in ["default", "full"]:
+            raise ValueError(
+                "vision_feature_select_strategy should be one of 'default', 'full'."
+                f"Got: {vision_feature_select_strategy}"
+            )
+
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.vision_feature_layer = vision_feature_layer
+
+        if isinstance(vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "clip_vision_model")
+            vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            vision_config = CONFIG_MAPPING["clip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1024,
+                patch_size=14,
+                image_size=336,
+                num_hidden_layers=24,
+                num_attention_heads=16,
+                vocab_size=32000,
+                projection_dim=768,
+            )
+
+        self.vision_config = vision_config
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["llama"]()
+
+        self.text_config = text_config
+        self.multimodal_projector_bias = multimodal_projector_bias
+
+        super().__init__(**kwargs)
+
+
+__all__ = ["LlavaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava.py
new file mode 100644
index 0000000000000000000000000000000000000000..5420d6fe291861951c7e6b5bfc555b81c57cbc90
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava.py
@@ -0,0 +1,437 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Image processor class for LLaVa."""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...image_processing_utils import BaseImageProcessor, BatchFeature, get_size_dict
+from ...image_transforms import (
+    convert_to_rgb,
+    get_resize_output_image_size,
+    resize,
+    to_channel_dimension_format,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    infer_channel_dimension_format,
+    is_scaled_image,
+    make_flat_list_of_images,
+    to_numpy_array,
+    valid_images,
+    validate_kwargs,
+    validate_preprocess_arguments,
+)
+from ...utils import TensorType, is_vision_available, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+if is_vision_available():
+    import PIL
+
+
+class LlavaImageProcessor(BaseImageProcessor):
+    r"""
+    Constructs a LLaVa image processor.
+
+    Args:
+        do_pad (`bool`, *optional*, defaults to `False`):
+            Whether to pad the image to a square based on the longest edge.
+            The padding value is determined by the `image_mean` parameter.
+            Can be overridden by `do_pad` in the `preprocess` method.
+        do_resize (`bool`, *optional*, defaults to `True`):
+            Whether to resize the image's (height, width) dimensions to the specified `size`. Can be overridden by
+            `do_resize` in the `preprocess` method.
+        size (`dict[str, int]` *optional*, defaults to `{"shortest_edge": 224}`):
+            Size of the image after resizing. The shortest edge of the image is resized to size["shortest_edge"], with
+            the longest edge resized to keep the input aspect ratio. Can be overridden by `size` in the `preprocess`
+            method.
+        resample (`PILImageResampling`, *optional*, defaults to `Resampling.BICUBIC`):
+            Resampling filter to use if resizing the image. Can be overridden by `resample` in the `preprocess` method.
+        do_center_crop (`bool`, *optional*, defaults to `True`):
+            Whether to center crop the image to the specified `crop_size`. Can be overridden by `do_center_crop` in the
+            `preprocess` method.
+        crop_size (`dict[str, int]` *optional*, defaults to 224):
+            Size of the output image after applying `center_crop`. Can be overridden by `crop_size` in the `preprocess`
+            method.
+        do_rescale (`bool`, *optional*, defaults to `True`):
+            Whether to rescale the image by the specified scale `rescale_factor`. Can be overridden by `do_rescale` in
+            the `preprocess` method.
+        rescale_factor (`int` or `float`, *optional*, defaults to `1/255`):
+            Scale factor to use if rescaling the image. Can be overridden by `rescale_factor` in the `preprocess`
+            method.
+        do_normalize (`bool`, *optional*, defaults to `True`):
+            Whether to normalize the image. Can be overridden by `do_normalize` in the `preprocess` method.
+        image_mean (`float` or `list[float]`, *optional*, defaults to `[0.48145466, 0.4578275, 0.40821073]`):
+            Mean to use if normalizing the image. This is a float or list of floats the length of the number of
+            channels in the image. Can be overridden by the `image_mean` parameter in the `preprocess` method.
+        image_std (`float` or `list[float]`, *optional*, defaults to `[0.26862954, 0.26130258, 0.27577711]`):
+            Standard deviation to use if normalizing the image. This is a float or list of floats the length of the
+            number of channels in the image. Can be overridden by the `image_std` parameter in the `preprocess` method.
+            Can be overridden by the `image_std` parameter in the `preprocess` method.
+        do_convert_rgb (`bool`, *optional*, defaults to `True`):
+            Whether to convert the image to RGB.
+    """
+
+    model_input_names = ["pixel_values"]
+
+    def __init__(
+        self,
+        do_pad: bool = False,
+        do_resize: bool = True,
+        size: Optional[dict[str, int]] = None,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        do_center_crop: bool = True,
+        crop_size: Optional[dict[str, int]] = None,
+        do_rescale: bool = True,
+        rescale_factor: Union[int, float] = 1 / 255,
+        do_normalize: bool = True,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: bool = True,
+        **kwargs,
+    ) -> None:
+        super().__init__(**kwargs)
+        size = size if size is not None else {"shortest_edge": 224}
+        size = get_size_dict(size, default_to_square=False)
+        crop_size = crop_size if crop_size is not None else {"height": 224, "width": 224}
+        crop_size = get_size_dict(crop_size, default_to_square=True, param_name="crop_size")
+
+        self.do_pad = do_pad
+        self.do_resize = do_resize
+        self.size = size
+        self.resample = resample
+        self.do_center_crop = do_center_crop
+        self.crop_size = crop_size
+        self.do_rescale = do_rescale
+        self.rescale_factor = rescale_factor
+        self.do_normalize = do_normalize
+        self.image_mean = image_mean if image_mean is not None else OPENAI_CLIP_MEAN
+        self.image_std = image_std if image_std is not None else OPENAI_CLIP_STD
+        self.do_convert_rgb = do_convert_rgb
+        self._valid_processor_keys = [
+            "images",
+            "do_pad",
+            "do_resize",
+            "size",
+            "resample",
+            "do_center_crop",
+            "crop_size",
+            "do_rescale",
+            "rescale_factor",
+            "do_normalize",
+            "image_mean",
+            "image_std",
+            "do_convert_rgb",
+            "return_tensors",
+            "data_format",
+            "input_data_format",
+        ]
+
+    def pad_to_square(
+        self,
+        image: np.ndarray,
+        background_color: Union[int, tuple[int, int, int]] = 0,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> np.ndarray:
+        """
+        Pads an image to a square based on the longest edge.
+
+        Args:
+            image (`np.ndarray`):
+                The image to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers representing for multi-channel images. If passed as integer
+                in multi-channel mode, it will default to `0` in subsequent channels.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the output image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use same as the input image.
+            input_data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format for the input image. Can be one of:
+                    - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                    - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                If unset, will use the inferred format of the input image.
+
+        Returns:
+            `np.ndarray`: The padded image.
+        """
+        height, width = get_image_size(image, input_data_format)
+        num_channels = image.shape[0] if input_data_format == ChannelDimension.FIRST else image.shape[-1]
+
+        if height == width:
+            image = (
+                to_channel_dimension_format(image, data_format, input_data_format)
+                if data_format is not None
+                else image
+            )
+            return image
+
+        max_dim = max(height, width)
+
+        # Ensure background_color is the correct shape
+        if isinstance(background_color, int):
+            background_color = [background_color]
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have no more than {num_channels} elements to match the number of channels"
+            )
+
+        if input_data_format == ChannelDimension.FIRST:
+            result = np.zeros((num_channels, max_dim, max_dim), dtype=image.dtype)
+            for i, color in enumerate(background_color):
+                result[i, :, :] = color
+            if width > height:
+                start = (max_dim - height) // 2
+                result[:, start : start + height, :] = image
+            else:
+                start = (max_dim - width) // 2
+                result[:, :, start : start + width] = image
+        else:
+            result = np.zeros((max_dim, max_dim, num_channels), dtype=image.dtype)
+            for i, color in enumerate(background_color):
+                result[:, :, i] = color
+            if width > height:
+                start = (max_dim - height) // 2
+                result[start : start + height, :, :] = image
+            else:
+                start = (max_dim - width) // 2
+                result[:, start : start + width, :] = image
+
+        image = (
+            to_channel_dimension_format(result, data_format, input_data_format) if data_format is not None else result
+        )
+        return image
+
+    # Copied from transformers.models.clip.image_processing_clip.CLIPImageProcessor.resize
+    def resize(
+        self,
+        image: np.ndarray,
+        size: dict[str, int],
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        data_format: Optional[Union[str, ChannelDimension]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> np.ndarray:
+        """
+        Resize an image. The shortest edge of the image is resized to size["shortest_edge"], with the longest edge
+        resized to keep the input aspect ratio.
+
+        Args:
+            image (`np.ndarray`):
+                Image to resize.
+            size (`dict[str, int]`):
+                Size of the output image.
+            resample (`PILImageResampling`, *optional*, defaults to `PILImageResampling.BICUBIC`):
+                Resampling filter to use when resiizing the image.
+            data_format (`str` or `ChannelDimension`, *optional*):
+                The channel dimension format of the image. If not provided, it will be the same as the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format of the input image. If not provided, it will be inferred.
+        """
+        default_to_square = True
+        if "shortest_edge" in size:
+            size = size["shortest_edge"]
+            default_to_square = False
+        elif "height" in size and "width" in size:
+            size = (size["height"], size["width"])
+        else:
+            raise ValueError("Size must contain either 'shortest_edge' or 'height' and 'width'.")
+
+        output_size = get_resize_output_image_size(
+            image,
+            size=size,
+            default_to_square=default_to_square,
+            input_data_format=input_data_format,
+        )
+        return resize(
+            image,
+            size=output_size,
+            resample=resample,
+            data_format=data_format,
+            input_data_format=input_data_format,
+            **kwargs,
+        )
+
+    def preprocess(
+        self,
+        images: ImageInput,
+        do_pad: Optional[bool] = None,
+        do_resize: Optional[bool] = None,
+        size: Optional[dict[str, int]] = None,
+        resample: Optional[PILImageResampling] = None,
+        do_center_crop: Optional[bool] = None,
+        crop_size: Optional[int] = None,
+        do_rescale: Optional[bool] = None,
+        rescale_factor: Optional[float] = None,
+        do_normalize: Optional[bool] = None,
+        image_mean: Optional[Union[float, list[float]]] = None,
+        image_std: Optional[Union[float, list[float]]] = None,
+        do_convert_rgb: Optional[bool] = None,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        data_format: Optional[ChannelDimension] = ChannelDimension.FIRST,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+        **kwargs,
+    ) -> PIL.Image.Image:
+        """
+        Preprocess an image or batch of images.
+
+        Args:
+            images (`ImageInput`):
+                Image to preprocess. Expects a single or batch of images with pixel values ranging from 0 to 255. If
+                passing in images with pixel values between 0 and 1, set `do_rescale=False`.
+            do_pad (`bool`, *optional*, defaults to `self.do_pad`):
+                Whether to pad the image to a square based on the longest edge.
+                The padding value is determined by the `image_mean` parameter.
+            do_resize (`bool`, *optional*, defaults to `self.do_resize`):
+                Whether to resize the image.
+            size (`dict[str, int]`, *optional*, defaults to `self.size`):
+                Size of the image after resizing. Shortest edge of the image is resized to size["shortest_edge"], with
+                the longest edge resized to keep the input aspect ratio.
+            resample (`int`, *optional*, defaults to `self.resample`):
+                Resampling filter to use if resizing the image. This can be one of the enum `PILImageResampling`. Only
+                has an effect if `do_resize` is set to `True`.
+            do_center_crop (`bool`, *optional*, defaults to `self.do_center_crop`):
+                Whether to center crop the image.
+            crop_size (`dict[str, int]`, *optional*, defaults to `self.crop_size`):
+                Size of the center crop. Only has an effect if `do_center_crop` is set to `True`.
+            do_rescale (`bool`, *optional*, defaults to `self.do_rescale`):
+                Whether to rescale the image.
+            rescale_factor (`float`, *optional*, defaults to `self.rescale_factor`):
+                Rescale factor to rescale the image by if `do_rescale` is set to `True`.
+            do_normalize (`bool`, *optional*, defaults to `self.do_normalize`):
+                Whether to normalize the image.
+            image_mean (`float` or `list[float]`, *optional*, defaults to `self.image_mean`):
+                Image mean to use for normalization. Only has an effect if `do_normalize` is set to `True`.
+            image_std (`float` or `list[float]`, *optional*, defaults to `self.image_std`):
+                Image standard deviation to use for normalization. Only has an effect if `do_normalize` is set to
+                `True`.
+            do_convert_rgb (`bool`, *optional*, defaults to `self.do_convert_rgb`):
+                Whether to convert the image to RGB.
+            return_tensors (`str` or `TensorType`, *optional*):
+                The type of tensors to return. Can be one of:
+                - Unset: Return a list of `np.ndarray`.
+                - `TensorType.TENSORFLOW` or `'tf'`: Return a batch of type `tf.Tensor`.
+                - `TensorType.PYTORCH` or `'pt'`: Return a batch of type `torch.Tensor`.
+                - `TensorType.NUMPY` or `'np'`: Return a batch of type `np.ndarray`.
+                - `TensorType.JAX` or `'jax'`: Return a batch of type `jax.numpy.ndarray`.
+            data_format (`ChannelDimension` or `str`, *optional*, defaults to `ChannelDimension.FIRST`):
+                The channel dimension format for the output image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - Unset: Use the channel dimension format of the input image.
+            input_data_format (`ChannelDimension` or `str`, *optional*):
+                The channel dimension format for the input image. If unset, the channel dimension format is inferred
+                from the input image. Can be one of:
+                - `"channels_first"` or `ChannelDimension.FIRST`: image in (num_channels, height, width) format.
+                - `"channels_last"` or `ChannelDimension.LAST`: image in (height, width, num_channels) format.
+                - `"none"` or `ChannelDimension.NONE`: image in (height, width) format.
+        """
+        do_pad = do_pad if do_pad is not None else self.do_pad
+        do_resize = do_resize if do_resize is not None else self.do_resize
+        size = size if size is not None else self.size
+        size = get_size_dict(size, param_name="size", default_to_square=False)
+        resample = resample if resample is not None else self.resample
+        do_center_crop = do_center_crop if do_center_crop is not None else self.do_center_crop
+        crop_size = crop_size if crop_size is not None else self.crop_size
+        crop_size = get_size_dict(crop_size, param_name="crop_size", default_to_square=True)
+        do_rescale = do_rescale if do_rescale is not None else self.do_rescale
+        rescale_factor = rescale_factor if rescale_factor is not None else self.rescale_factor
+        do_normalize = do_normalize if do_normalize is not None else self.do_normalize
+        image_mean = image_mean if image_mean is not None else self.image_mean
+        image_std = image_std if image_std is not None else self.image_std
+        do_convert_rgb = do_convert_rgb if do_convert_rgb is not None else self.do_convert_rgb
+
+        validate_kwargs(captured_kwargs=kwargs.keys(), valid_processor_keys=self._valid_processor_keys)
+
+        images = self.fetch_images(images)
+        images = make_flat_list_of_images(images)
+
+        if not valid_images(images):
+            raise ValueError(
+                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
+                "torch.Tensor, tf.Tensor or jax.ndarray."
+            )
+        # we don't pass `do_pad` here since LLaVa uses a custom padding to a square
+        validate_preprocess_arguments(
+            do_rescale=do_rescale,
+            rescale_factor=rescale_factor,
+            do_normalize=do_normalize,
+            image_mean=image_mean,
+            image_std=image_std,
+            do_center_crop=do_center_crop,
+            crop_size=crop_size,
+            do_resize=do_resize,
+            size=size,
+            resample=resample,
+        )
+
+        if do_convert_rgb:
+            images = [convert_to_rgb(image) for image in images]
+
+        # All transformations expect numpy arrays.
+        images = [to_numpy_array(image) for image in images]
+
+        if is_scaled_image(images[0]) and do_rescale:
+            logger.warning_once(
+                "It looks like you are trying to rescale already rescaled images. If the input"
+                " images have pixel values between 0 and 1, set `do_rescale=False` to avoid rescaling them again."
+            )
+
+        if input_data_format is None:
+            # We assume that all images have the same channel dimension format.
+            input_data_format = infer_channel_dimension_format(images[0])
+
+        processed_images = []
+        for image in images:
+            if do_pad:
+                image = self.pad_to_square(
+                    image=image,
+                    background_color=tuple(int(x * 255) for x in self.image_mean),
+                    input_data_format=input_data_format,
+                )
+
+            if do_resize:
+                image = self.resize(image=image, size=size, resample=resample, input_data_format=input_data_format)
+
+            if do_center_crop:
+                image = self.center_crop(image=image, size=crop_size, input_data_format=input_data_format)
+
+            if do_rescale:
+                image = self.rescale(image=image, scale=rescale_factor, input_data_format=input_data_format)
+
+            if do_normalize:
+                image = self.normalize(
+                    image=image, mean=image_mean, std=image_std, input_data_format=input_data_format
+                )
+
+            image = to_channel_dimension_format(image, data_format, input_channel_dim=input_data_format)
+            processed_images.append(image)
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+
+__all__ = ["LlavaImageProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..5960700405494208325f970ee3a1495cfddc6a87
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/image_processing_llava_fast.py
@@ -0,0 +1,171 @@
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for LLaVa."""
+
+from typing import Optional, Union
+
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    OPENAI_CLIP_MEAN,
+    OPENAI_CLIP_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    get_image_size,
+)
+from ...processing_utils import Unpack
+from ...utils import (
+    TensorType,
+    auto_docstring,
+)
+
+
+class LlavaFastImageProcessorKwargs(DefaultFastImageProcessorKwargs): ...
+
+
+@auto_docstring
+class LlavaImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = OPENAI_CLIP_MEAN
+    image_std = OPENAI_CLIP_STD
+    size = {"shortest_edge": 224}
+    default_to_square = False
+    crop_size = {"height": 224, "width": 224}
+    do_pad = False
+    do_resize = True
+    do_center_crop = True
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    valid_kwargs = LlavaFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> None:
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images: ImageInput, **kwargs: Unpack[LlavaFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    def pad_to_square(
+        self,
+        images: "torch.Tensor",
+        background_color: Union[int, tuple[int, int, int]] = 0,
+    ) -> "torch.Tensor":
+        """
+        Pads an image to a square based on the longest edge.
+
+        Args:
+            images (`np.ndarray`):
+                The images to pad.
+            background_color (`int` or `tuple[int, int, int]`, *optional*, defaults to 0):
+                The color to use for the padding. Can be an integer for single channel or a
+                tuple of integers representing for multi-channel images. If passed as integer
+                in multi-channel mode, it will default to `0` in subsequent channels.
+        Returns:
+            `torch.Tensor`: The padded images.
+        """
+        height, width = get_image_size(images, ChannelDimension.FIRST)
+
+        if height == width:
+            return images
+
+        num_channels = images.shape[1] if len(images.shape) == 4 else images.shape[0]
+        if isinstance(background_color, int):
+            background_color = [background_color] + [0] * (num_channels - 1)
+        elif len(background_color) != num_channels:
+            raise ValueError(
+                f"background_color must have no more than {num_channels} elements to match the number of channels"
+            )
+
+        max_dim = max(height, width)
+        paste_x_left = (max_dim - width) // 2
+        paste_y_left = (max_dim - height) // 2
+        paste_x_right = max_dim - width - paste_x_left
+        paste_y_right = max_dim - height - paste_y_left
+        padded_images = F.pad(
+            images, padding=[paste_x_left, paste_y_left, paste_x_right, paste_y_right], fill=background_color
+        )
+
+        return padded_images
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_pad: bool,
+        do_center_crop: bool,
+        crop_size: SizeDict,
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        disable_grouping: Optional[bool],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        # Group images by size for batched resizing
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_pad:
+                stacked_images = self.pad_to_square(
+                    images=stacked_images, background_color=tuple(int(x * 255) for x in self.image_mean)
+                )
+            resized_images_grouped[shape] = stacked_images
+        padded_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for batched resizing
+        # Needed in case do_pad is False, or padding returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(padded_images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_center_crop:
+                stacked_images = self.center_crop(stacked_images, crop_size)
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+
+__all__ = ["LlavaImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/modeling_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/modeling_llava.py
new file mode 100644
index 0000000000000000000000000000000000000000..bc0bb0df7c7baa60aaee0f2fd585102368bea1db
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/modeling_llava.py
@@ -0,0 +1,484 @@
+# coding=utf-8
+# Copyright 2023 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Llava model."""
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple, logging
+from ..auto import AutoModel
+from .configuration_llava import LlavaConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Llava outputs, with hidden states and attentions.
+    """
+)
+class LlavaModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Llava causal language model (or autoregressive) outputs.
+    """
+)
+class LlavaCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+class LlavaMultiModalProjector(nn.Module):
+    def __init__(self, config: LlavaConfig):
+        super().__init__()
+        # We have hidden_size * the number of vision feature layers
+        num_feature_layers = 1 if isinstance(config.vision_feature_layer, int) else len(config.vision_feature_layer)
+        self.linear_1 = nn.Linear(
+            config.vision_config.hidden_size * num_feature_layers,
+            config.text_config.hidden_size,
+            bias=config.multimodal_projector_bias,
+        )
+        self.act = ACT2FN[config.projector_hidden_act]
+        self.linear_2 = nn.Linear(
+            config.text_config.hidden_size, config.text_config.hidden_size, bias=config.multimodal_projector_bias
+        )
+
+    def forward(self, image_features):
+        hidden_states = self.linear_1(image_features)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.linear_2(hidden_states)
+        return hidden_states
+
+
+@auto_docstring
+class LlavaPreTrainedModel(PreTrainedModel):
+    config: LlavaConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+
+@auto_docstring(
+    custom_intro="""
+    The Llava model which consists of a vision backbone and a language model, without a language modeling head.
+    """
+)
+class LlavaModel(LlavaPreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+
+    def __init__(self, config: LlavaConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+
+        self.multi_modal_projector = LlavaMultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`):
+               The tensors corresponding to the input images.
+            vision_feature_layer (`Union[int, list[int]]`, *optional*):
+                The index of the layer to select the vision feature. If multiple indices are provided,
+                the vision feature of the corresponding indices will be concatenated to form the
+                vision features.
+            vision_feature_select_strategy (`str`, *optional*):
+                The feature selection strategy used to select the vision feature from the vision backbone.
+                Can be one of `"default"` or `"full"`
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        if vision_feature_select_strategy not in ["default", "full"]:
+            raise ValueError(f"Unexpected select feature strategy: {self.config.vision_feature_select_strategy}")
+
+        kwargs = {k: v for k, v in kwargs.items() if v is not None}
+        # this is not memory efficient at all (output_hidden_states=True) will save all the hidden states.
+        image_outputs = self.vision_tower(pixel_values, output_hidden_states=True, **kwargs)
+
+        # If we have one vision feature layer, return the corresponding hidden states,
+        # otherwise, select the hidden states of each feature layer and concatenate them
+        if isinstance(vision_feature_layer, int):
+            selected_image_feature = image_outputs.hidden_states[vision_feature_layer]
+            if vision_feature_select_strategy == "default":
+                selected_image_feature = selected_image_feature[:, 1:]
+        else:
+            hs_pool = [image_outputs.hidden_states[layer_idx] for layer_idx in vision_feature_layer]
+            # For default; crop CLS from each hidden state in the hidden state pool
+            if vision_feature_select_strategy == "default":
+                hs_pool = [hs[:, 1:] for hs in hs_pool]
+            selected_image_feature = torch.cat(hs_pool, dim=-1)
+
+        image_features = self.multi_modal_projector(selected_image_feature)
+
+        if "image_sizes" in kwargs:
+            split_sizes = [
+                (height // self.vision_tower.patch_size) * (width // self.vision_tower.patch_size)
+                for height, width in kwargs["image_sizes"]
+            ]
+            image_features = torch.split(image_features.squeeze(0), split_sizes)
+        else:
+            image_features = list(image_features)
+        return image_features
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        image_sizes: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, LlavaModelOutputWithPast]:
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        if pixel_values is not None:
+            image_features = self.get_image_features(
+                pixel_values=pixel_values,
+                vision_feature_layer=vision_feature_layer,
+                vision_feature_select_strategy=vision_feature_select_strategy,
+                image_sizes=image_sizes,
+            )
+            image_features = torch.cat(image_features, dim=0).to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return LlavaModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The LLAVA model which consists of a vision backbone and a language model.
+    """
+)
+class LlavaForConditionalGeneration(LlavaPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: LlavaConfig):
+        super().__init__(config)
+        self.model = LlavaModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        **kwargs,
+    ):
+        return self.model.get_image_features(
+            pixel_values=pixel_values,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            **kwargs,
+        )
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        vision_feature_layer: Optional[Union[int, list[int]]] = None,
+        vision_feature_select_strategy: Optional[str] = None,
+        labels: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        image_sizes: Optional[torch.Tensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, LlavaCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, LlavaForConditionalGeneration
+
+        >>> model = LlavaForConditionalGeneration.from_pretrained("llava-hf/llava-1.5-7b-hf")
+        >>> processor = AutoProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
+
+        >>> prompt = "USER: <image>\nWhat's the content of the image? ASSISTANT:"
+        >>> url = "https://www.ilankelman.org/stopsigns/australia.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs, max_new_tokens=15)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "USER:  \nWhat's the content of the image? ASSISTANT: The image features a busy city street with a stop sign prominently displayed"
+        ```"""
+        vision_feature_layer = (
+            vision_feature_layer if vision_feature_layer is not None else self.config.vision_feature_layer
+        )
+        vision_feature_select_strategy = (
+            vision_feature_select_strategy
+            if vision_feature_select_strategy is not None
+            else self.config.vision_feature_select_strategy
+        )
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            vision_feature_layer=vision_feature_layer,
+            vision_feature_select_strategy=vision_feature_select_strategy,
+            cache_position=cache_position,
+            image_sizes=image_sizes,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return LlavaCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+
+        return model_inputs
+
+
+__all__ = ["LlavaForConditionalGeneration", "LlavaPreTrainedModel", "LlavaModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/processing_llava.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/processing_llava.py
new file mode 100644
index 0000000000000000000000000000000000000000..63c07c20cbb9974fb6d2597e88e5224cd0111a52
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava/processing_llava.py
@@ -0,0 +1,212 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for Llava.
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, get_image_size, to_numpy_array
+from ...processing_utils import (
+    MultiModalData,
+    ProcessingKwargs,
+    ProcessorMixin,
+    Unpack,
+)
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class LlavaProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {"padding": False, "return_mm_token_type_ids": False},
+        "images_kwargs": {},
+    }
+
+
+class LlavaProcessor(ProcessorMixin):
+    r"""
+    Constructs a LLaVa processor which wraps a LLaVa image processor and a LLaMa tokenizer into a single processor.
+
+    [`LlavaProcessor`] offers all the functionalities of [`LlavaImageProcessor`] and [`LlamaTokenizerFast`]. See the
+    [`~LlavaProcessor.__call__`] and [`~LlavaProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`LlavaImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`LlamaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        patch_size (`int`, *optional*):
+            Patch size from the vision tower.
+        vision_feature_select_strategy (`str`, *optional*):
+            The feature selection strategy used to select the vision feature from the vision backbone.
+            Should be same as in model's config
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+        image_token (`str`, *optional*, defaults to `"<image>"`):
+            Special token used to denote image location.
+        num_additional_image_tokens (`int`, *optional*, defaults to 0):
+            Number of additional tokens added to the image embeddings, such as CLS (+1). If the backbone has no CLS or other
+            extra tokens appended, no need to set this arg.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        patch_size=None,
+        vision_feature_select_strategy=None,
+        chat_template=None,
+        image_token="<image>",  # set the default and let users change if they have peculiar special tokens in rare cases
+        num_additional_image_tokens=0,
+        **kwargs,
+    ):
+        self.patch_size = patch_size
+        self.num_additional_image_tokens = num_additional_image_tokens
+        self.vision_feature_select_strategy = vision_feature_select_strategy
+        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
+        self.image_token_id = tokenizer.encode(self.image_token, add_special_tokens=False)[0]
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[LlavaProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        CLIPImageProcessor's [`~CLIPImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. Both channels-first and channels-last formats are supported.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+        """
+        if images is None and text is None:
+            raise ValueError("You have to specify at least one of `images` or `text`.")
+
+        output_kwargs = self._merge_kwargs(
+            LlavaProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
+        else:
+            image_inputs = {}
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise TypeError("Invalid input text. Please provide a string, or a list of strings")
+
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = text
+        if image_inputs.get("pixel_values") is not None:
+            # Replace the image token with the expanded image token sequence
+            pixel_values = image_inputs["pixel_values"]
+            height, width = get_image_size(to_numpy_array(pixel_values[0]))
+            num_image_tokens = (height // self.patch_size) * (
+                width // self.patch_size
+            ) + self.num_additional_image_tokens
+            if self.vision_feature_select_strategy == "default":
+                num_image_tokens -= 1
+
+            prompt_strings = []
+            for sample in text:
+                sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
+                prompt_strings.append(sample)
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = LlavaProcessorKwargs._defaults.get("images_kwargs", {})
+            images_kwargs.update(kwargs)
+            crop_size = images_kwargs.get("crop_size", None) or self.image_processor.crop_size
+            resized_height, resized_width = crop_size["height"], crop_size["width"]
+
+            num_image_tokens = (resized_height // self.patch_size) * (resized_width // self.patch_size)
+            num_image_tokens += self.num_additional_image_tokens
+            if self.vision_feature_select_strategy == "default":
+                num_image_tokens -= 1
+
+            num_image_tokens = [num_image_tokens] * len(image_sizes)
+            num_image_patches = [1] * len(image_sizes)
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+
+        return MultiModalData(**vision_data)
+
+
+__all__ = ["LlavaProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..700e26350df81d1d7a2163eef51e8b2da573a87a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/configuration_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/configuration_llava_next.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dbf7a4c64f112ee29bc62bd9da22659e9b847213
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/configuration_llava_next.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f17a7aba47c2f2076420246293d03d5293fad303
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d98becc15135d91b98a036a39392f5940a80a1f3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/image_processing_llava_next_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/modeling_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/modeling_llava_next.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2728ca85c654ef7fd69d3908e95fe4837f24656
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/modeling_llava_next.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/processing_llava_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/processing_llava_next.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..349b5f6f0e5410c19a225a491e187febfd69fb50
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/llava_next/__pycache__/processing_llava_next.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6684e9c33036c104dc5f1ec1ec0dc02d87c91719
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/configuration_longcat_flash.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/configuration_longcat_flash.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f12304a64dd0575dc10b3efb46d8b8eb7ad498da
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/configuration_longcat_flash.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modeling_longcat_flash.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modeling_longcat_flash.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e54390584863e036558bdfef33d6803216f3a221
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modeling_longcat_flash.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modular_longcat_flash.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modular_longcat_flash.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9e70290fef88869d172fdba02dcfecb9cfe5b69d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longcat_flash/__pycache__/modular_longcat_flash.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e3365cff6e6847a2b381019c098a48e5c1dd0a5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/configuration_longt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/configuration_longt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af68d5ff07b3ed8bcf04a0e8e4e4680f7ae2b05f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/longt5/__pycache__/configuration_longt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..623b56c71c35571e4634547534093921483d8f31
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f938a076d279831a363877b172fbe6fbe2117ca9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/configuration_lxmert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6ea3a201affe0c1241fc638ba475544b69e37231
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_lxmert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3b7effe1fbd5fe6afa973fa3ecb0ba77a41e855a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/modeling_tf_lxmert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..019dafc905e6b248b9ef9686c953838aaeb0e2f0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b71c0d72f1e493a0d7c42e9e5d8d6ed8dc309f9d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/lxmert/__pycache__/tokenization_lxmert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/markuplm/__pycache__/modeling_markuplm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/markuplm/__pycache__/modeling_markuplm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1cde448aebcdf2a991f0e65debf56e058f85df1a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/markuplm/__pycache__/modeling_markuplm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..536514d32ee704a198b8746bd8ab2b13d4c3df84
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b4d9863803c4e5773c683cccfaeb38c5ca4a873
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer_swin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb923c87938e81456d24bd63d39b03d918541368
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/configuration_maskformer_swin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/feature_extraction_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/feature_extraction_maskformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f85fa69a40410358b41d5bdd01f9a409e10dffca
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/feature_extraction_maskformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0586f6f33d29a462b32e74243448b4396889390c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eeddb15a71cb5dbcee7f6e1677124a274ae5497e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/image_processing_maskformer_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d45911043144c8840f32aaea487196f3d83bb0df
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer_swin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7368cd0b9fe05eaf1266a473d316d11ad85ceb68
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/maskformer/__pycache__/modeling_maskformer_swin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5797a1e89776a1de02ee857bc3577f5f0c4d99e5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7fad6458561921cde60468881f3e2298b6ae320
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e12700e5b32be209afd8ba9812084019bc6cccf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mbart50/__pycache__/tokenization_mbart50_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c65c038af4f46cfdb545d9e42015cccc8b198f94
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/checkpoint_reshaping_and_interoperability.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/checkpoint_reshaping_and_interoperability.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a84dc60a6bc88feb5777b8e202fd2946f556588c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/megatron_gpt2/__pycache__/checkpoint_reshaping_and_interoperability.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..a20c4cd4efc3ee8da6b22d0d4edf3dab25dcfc06
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_mgp_str import *
+    from .modeling_mgp_str import *
+    from .processing_mgp_str import *
+    from .tokenization_mgp_str import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/configuration_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/configuration_mgp_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..1f6b355c7718054f2522a6892f5a3212c4d85ced
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/configuration_mgp_str.py
@@ -0,0 +1,137 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MGP-STR model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MgpstrConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of an [`MgpstrModel`]. It is used to instantiate an
+    MGP-STR model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the MGP-STR
+    [alibaba-damo/mgp-str-base](https://huggingface.co/alibaba-damo/mgp-str-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        image_size (`list[int]`, *optional*, defaults to `[32, 128]`):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 4):
+            The size (resolution) of each patch.
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        max_token_length (`int`, *optional*, defaults to 27):
+            The max number of output tokens.
+        num_character_labels (`int`, *optional*, defaults to 38):
+            The number of classes for character head .
+        num_bpe_labels (`int`, *optional*, defaults to 50257):
+            The number of classes for bpe head .
+        num_wordpiece_labels (`int`, *optional*, defaults to 30522):
+            The number of classes for wordpiece head .
+        hidden_size (`int`, *optional*, defaults to 768):
+            The embedding dimension.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        mlp_ratio (`float`, *optional*, defaults to 4.0):
+            The ratio of mlp hidden dim to embedding dim.
+        qkv_bias (`bool`, *optional*, defaults to `True`):
+            Whether to add a bias to the queries, keys and values.
+        distilled (`bool`, *optional*, defaults to `False`):
+            Model includes a distillation token and head as in DeiT models.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        drop_rate (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all fully connected layers in the embeddings, encoder.
+        attn_drop_rate (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        drop_path_rate (`float`, *optional*, defaults to 0.0):
+            The stochastic depth rate.
+        output_a3_attentions (`bool`, *optional*, defaults to `False`):
+            Whether or not the model should returns A^3 module attentions.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+
+    Example:
+
+    ```python
+    >>> from transformers import MgpstrConfig, MgpstrForSceneTextRecognition
+
+    >>> # Initializing a Mgpstr mgp-str-base style configuration
+    >>> configuration = MgpstrConfig()
+
+    >>> # Initializing a model (with random weights) from the mgp-str-base style configuration
+    >>> model = MgpstrForSceneTextRecognition(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mgp-str"
+
+    def __init__(
+        self,
+        image_size=[32, 128],
+        patch_size=4,
+        num_channels=3,
+        max_token_length=27,
+        num_character_labels=38,
+        num_bpe_labels=50257,
+        num_wordpiece_labels=30522,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        mlp_ratio=4.0,
+        qkv_bias=True,
+        distilled=False,
+        layer_norm_eps=1e-5,
+        drop_rate=0.0,
+        attn_drop_rate=0.0,
+        drop_path_rate=0.0,
+        output_a3_attentions=False,
+        initializer_range=0.02,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.num_channels = num_channels
+        self.max_token_length = max_token_length
+        self.num_character_labels = num_character_labels
+        self.num_bpe_labels = num_bpe_labels
+        self.num_wordpiece_labels = num_wordpiece_labels
+        self.hidden_size = hidden_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.mlp_ratio = mlp_ratio
+        self.distilled = distilled
+        self.layer_norm_eps = layer_norm_eps
+        self.drop_rate = drop_rate
+        self.qkv_bias = qkv_bias
+        self.attn_drop_rate = attn_drop_rate
+        self.drop_path_rate = drop_path_rate
+        self.output_a3_attentions = output_a3_attentions
+        self.initializer_range = initializer_range
+
+
+__all__ = ["MgpstrConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/modeling_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/modeling_mgp_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..be7cf08b14ab74b4a4598d4ee557f9388199a3ed
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/modeling_mgp_str.py
@@ -0,0 +1,459 @@
+# coding=utf-8
+# Copyright 2023 Alibaba Research and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch MGP-STR model."""
+
+import collections.abc
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from .configuration_mgp_str import MgpstrConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.beit.modeling_beit.drop_path
+def drop_path(input: torch.Tensor, drop_prob: float = 0.0, training: bool = False) -> torch.Tensor:
+    """
+    Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+
+    Comment by Ross Wightman: This is the same as the DropConnect impl I created for EfficientNet, etc networks,
+    however, the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
+    See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for changing the
+    layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use 'survival rate' as the
+    argument.
+    """
+    if drop_prob == 0.0 or not training:
+        return input
+    keep_prob = 1 - drop_prob
+    shape = (input.shape[0],) + (1,) * (input.ndim - 1)  # work with diff dim tensors, not just 2D ConvNets
+    random_tensor = keep_prob + torch.rand(shape, dtype=input.dtype, device=input.device)
+    random_tensor.floor_()  # binarize
+    output = input.div(keep_prob) * random_tensor
+    return output
+
+
+# Copied from transformers.models.beit.modeling_beit.BeitDropPath with Beit->Mgpstr
+class MgpstrDropPath(nn.Module):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks)."""
+
+    def __init__(self, drop_prob: Optional[float] = None) -> None:
+        super().__init__()
+        self.drop_prob = drop_prob
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        return drop_path(hidden_states, self.drop_prob, self.training)
+
+    def extra_repr(self) -> str:
+        return f"p={self.drop_prob}"
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for vision model's outputs that also contains image embeddings of the pooling of the last hidden states.
+    """
+)
+class MgpstrModelOutput(ModelOutput):
+    r"""
+    logits (`tuple(torch.FloatTensor)` of shape `(batch_size, config.num_character_labels)`):
+        Tuple of `torch.FloatTensor` (one for the output of character of shape `(batch_size,
+        config.max_token_length, config.num_character_labels)`, + one for the output of bpe of shape `(batch_size,
+        config.max_token_length, config.num_bpe_labels)`, + one for the output of wordpiece of shape `(batch_size,
+        config.max_token_length, config.num_wordpiece_labels)`) .
+
+        Classification scores (before SoftMax) of character, bpe and wordpiece.
+    a3_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_a3_attentions=True` is passed or when `config.output_a3_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for the attention of character, + one for the attention of bpe`, + one
+        for the attention of wordpiece) of shape `(batch_size, config.max_token_length, sequence_length)`.
+
+        Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
+        heads.
+    """
+
+    logits: Optional[tuple[torch.FloatTensor]] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    a3_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+class MgpstrEmbeddings(nn.Module):
+    """2D Image to Patch Embedding"""
+
+    def __init__(self, config: MgpstrConfig):
+        super().__init__()
+        image_size = (
+            config.image_size
+            if isinstance(config.image_size, collections.abc.Iterable)
+            else (config.image_size, config.image_size)
+        )
+        patch_size = (
+            config.patch_size
+            if isinstance(config.patch_size, collections.abc.Iterable)
+            else (config.patch_size, config.patch_size)
+        )
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.grid_size = (image_size[0] // patch_size[0], image_size[1] // patch_size[1])
+        self.num_patches = self.grid_size[0] * self.grid_size[1]
+        self.num_tokens = 2 if config.distilled else 1
+
+        self.proj = nn.Conv2d(config.num_channels, config.hidden_size, kernel_size=patch_size, stride=patch_size)
+
+        self.cls_token = nn.Parameter(torch.zeros(1, 1, config.hidden_size))
+
+        self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + self.num_tokens, config.hidden_size))
+        self.pos_drop = nn.Dropout(p=config.drop_rate)
+
+    def forward(self, pixel_values):
+        batch_size, channel, height, width = pixel_values.shape
+        if height != self.image_size[0] or width != self.image_size[1]:
+            raise ValueError(
+                f"Input image size ({height}*{width}) doesn't match model ({self.image_size[0]}*{self.image_size[1]})."
+            )
+
+        patch_embeddings = self.proj(pixel_values)
+        patch_embeddings = patch_embeddings.flatten(2).transpose(1, 2)  # BCHW -> BNC
+
+        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
+        embedding_output = torch.cat((cls_tokens, patch_embeddings), dim=1)
+        embedding_output = embedding_output + self.pos_embed
+        embedding_output = self.pos_drop(embedding_output)
+
+        return embedding_output
+
+
+class MgpstrMlp(nn.Module):
+    """MLP as used in Vision Transformer, MLP-Mixer and related networks"""
+
+    def __init__(self, config: MgpstrConfig, hidden_features):
+        super().__init__()
+        hidden_features = hidden_features or config.hidden_size
+        self.fc1 = nn.Linear(config.hidden_size, hidden_features)
+        self.act = nn.GELU()
+        self.fc2 = nn.Linear(hidden_features, config.hidden_size)
+        self.drop = nn.Dropout(config.drop_rate)
+
+    def forward(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.drop(hidden_states)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.drop(hidden_states)
+        return hidden_states
+
+
+class MgpstrAttention(nn.Module):
+    def __init__(self, config: MgpstrConfig):
+        super().__init__()
+        self.num_heads = config.num_attention_heads
+        head_dim = config.hidden_size // config.num_attention_heads
+        self.scale = head_dim**-0.5
+
+        self.qkv = nn.Linear(config.hidden_size, config.hidden_size * 3, bias=config.qkv_bias)
+        self.attn_drop = nn.Dropout(config.attn_drop_rate)
+        self.proj = nn.Linear(config.hidden_size, config.hidden_size)
+        self.proj_drop = nn.Dropout(config.drop_rate)
+
+    def forward(self, hidden_states):
+        batch_size, num, channel = hidden_states.shape
+        qkv = (
+            self.qkv(hidden_states)
+            .reshape(batch_size, num, 3, self.num_heads, channel // self.num_heads)
+            .permute(2, 0, 3, 1, 4)
+        )
+        query, key, value = qkv[0], qkv[1], qkv[2]  # make torchscript happy (cannot use tensor as tuple)
+
+        attention_probs = (query @ key.transpose(-2, -1)) * self.scale
+        attention_probs = attention_probs.softmax(dim=-1)
+        attention_probs = self.attn_drop(attention_probs)
+
+        context_layer = (attention_probs @ value).transpose(1, 2).reshape(batch_size, num, channel)
+        context_layer = self.proj(context_layer)
+        context_layer = self.proj_drop(context_layer)
+        return (context_layer, attention_probs)
+
+
+class MgpstrLayer(nn.Module):
+    def __init__(self, config: MgpstrConfig, drop_path=None):
+        super().__init__()
+        self.norm1 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.attn = MgpstrAttention(config)
+        # NOTE: drop path for stochastic depth, we shall see if this is better than dropout here
+        self.drop_path = MgpstrDropPath(drop_path) if drop_path is not None else nn.Identity()
+        self.norm2 = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        mlp_hidden_dim = int(config.hidden_size * config.mlp_ratio)
+        self.mlp = MgpstrMlp(config, mlp_hidden_dim)
+
+    def forward(self, hidden_states):
+        self_attention_outputs = self.attn(self.norm1(hidden_states))
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1]
+
+        # first residual connection
+        hidden_states = self.drop_path(attention_output) + hidden_states
+
+        # second residual connection is done here
+        layer_output = hidden_states + self.drop_path(self.mlp(self.norm2(hidden_states)))
+
+        outputs = (layer_output, outputs)
+        return outputs
+
+
+class MgpstrEncoder(nn.Module):
+    def __init__(self, config: MgpstrConfig):
+        super().__init__()
+        # stochastic depth decay rule
+        dpr = [x.item() for x in torch.linspace(0, config.drop_path_rate, config.num_hidden_layers, device="cpu")]
+
+        self.blocks = nn.Sequential(
+            *[MgpstrLayer(config=config, drop_path=dpr[i]) for i in range(config.num_hidden_layers)]
+        )
+
+    def forward(self, hidden_states, output_attentions=False, output_hidden_states=False, return_dict=True):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        for _, blk in enumerate(self.blocks):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            layer_outputs = blk(hidden_states)
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class MgpstrA3Module(nn.Module):
+    def __init__(self, config: MgpstrConfig):
+        super().__init__()
+        self.token_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.tokenLearner = nn.Sequential(
+            nn.Conv2d(config.hidden_size, config.hidden_size, kernel_size=(1, 1), stride=1, groups=8, bias=False),
+            nn.Conv2d(config.hidden_size, config.max_token_length, kernel_size=(1, 1), stride=1, bias=False),
+        )
+        self.feat = nn.Conv2d(
+            config.hidden_size, config.hidden_size, kernel_size=(1, 1), stride=1, groups=8, bias=False
+        )
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+    def forward(self, hidden_states):
+        hidden_states = self.token_norm(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2).unsqueeze(-1)
+        selected = self.tokenLearner(hidden_states)
+        selected = selected.flatten(2)
+        attentions = F.softmax(selected, dim=-1)
+
+        feat = self.feat(hidden_states)
+        feat = feat.flatten(2).transpose(1, 2)
+        feat = torch.einsum("...si,...id->...sd", attentions, feat)
+        a3_out = self.norm(feat)
+
+        return (a3_out, attentions)
+
+
+@auto_docstring
+class MgpstrPreTrainedModel(PreTrainedModel):
+    config: MgpstrConfig
+    base_model_prefix = "mgp_str"
+    _no_split_modules = []
+
+    def _init_weights(self, module: nn.Module) -> None:
+        """Initialize the weights"""
+        std = self.config.initializer_range
+        if isinstance(module, MgpstrEmbeddings):
+            nn.init.trunc_normal_(module.pos_embed, mean=0.0, std=std)
+            nn.init.trunc_normal_(module.cls_token, mean=0.0, std=std)
+        elif isinstance(module, (nn.Linear, nn.Conv2d)):
+            nn.init.trunc_normal_(module.weight.data, mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class MgpstrModel(MgpstrPreTrainedModel):
+    def __init__(self, config: MgpstrConfig):
+        super().__init__(config)
+        self.config = config
+        self.embeddings = MgpstrEmbeddings(config)
+        self.encoder = MgpstrEncoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self) -> nn.Module:
+        return self.embeddings.proj
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.embeddings(pixel_values)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return encoder_outputs
+        return BaseModelOutput(
+            last_hidden_state=encoder_outputs.last_hidden_state,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    MGP-STR Model transformer with three classification heads on top (three A^3 modules and three linear layer on top
+    of the transformer encoder output) for scene text recognition (STR) .
+    """
+)
+class MgpstrForSceneTextRecognition(MgpstrPreTrainedModel):
+    config: MgpstrConfig
+    main_input_name = "pixel_values"
+
+    def __init__(self, config: MgpstrConfig) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.mgp_str = MgpstrModel(config)
+
+        self.char_a3_module = MgpstrA3Module(config)
+        self.bpe_a3_module = MgpstrA3Module(config)
+        self.wp_a3_module = MgpstrA3Module(config)
+
+        self.char_head = nn.Linear(config.hidden_size, config.num_character_labels)
+        self.bpe_head = nn.Linear(config.hidden_size, config.num_bpe_labels)
+        self.wp_head = nn.Linear(config.hidden_size, config.num_wordpiece_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        output_attentions: Optional[bool] = None,
+        output_a3_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], MgpstrModelOutput]:
+        r"""
+        output_a3_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of a3 modules. See `a3_attentions` under returned tensors
+            for more detail.
+
+        Example:
+
+        ```python
+        >>> from transformers import (
+        ...     MgpstrProcessor,
+        ...     MgpstrForSceneTextRecognition,
+        ... )
+        >>> import requests
+        >>> from PIL import Image
+
+        >>> # load image from the IIIT-5k dataset
+        >>> url = "https://i.postimg.cc/ZKwLg2Gw/367-14.png"
+        >>> image = Image.open(requests.get(url, stream=True).raw).convert("RGB")
+
+        >>> processor = MgpstrProcessor.from_pretrained("alibaba-damo/mgp-str-base")
+        >>> pixel_values = processor(images=image, return_tensors="pt").pixel_values
+
+        >>> model = MgpstrForSceneTextRecognition.from_pretrained("alibaba-damo/mgp-str-base")
+
+        >>> # inference
+        >>> outputs = model(pixel_values)
+        >>> out_strs = processor.batch_decode(outputs.logits)
+        >>> out_strs["generated_text"]
+        '["ticket"]'
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        mgp_outputs = self.mgp_str(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = mgp_outputs[0]
+
+        char_a3_out, char_attention = self.char_a3_module(sequence_output)
+        bpe_a3_out, bpe_attention = self.bpe_a3_module(sequence_output)
+        wp_a3_out, wp_attention = self.wp_a3_module(sequence_output)
+
+        char_logits = self.char_head(char_a3_out)
+        bpe_logits = self.bpe_head(bpe_a3_out)
+        wp_logits = self.wp_head(wp_a3_out)
+
+        all_a3_attentions = (char_attention, bpe_attention, wp_attention) if output_a3_attentions else None
+        all_logits = (char_logits, bpe_logits, wp_logits)
+
+        if not return_dict:
+            outputs = (all_logits, all_a3_attentions) + mgp_outputs[1:]
+            return tuple(output for output in outputs if output is not None)
+        return MgpstrModelOutput(
+            logits=all_logits,
+            hidden_states=mgp_outputs.hidden_states,
+            attentions=mgp_outputs.attentions,
+            a3_attentions=all_a3_attentions,
+        )
+
+
+__all__ = ["MgpstrModel", "MgpstrPreTrainedModel", "MgpstrForSceneTextRecognition"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/processing_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/processing_mgp_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..8f7d32388748b2c8d9ea3bbf35edacc464408a69
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/processing_mgp_str.py
@@ -0,0 +1,231 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Processor class for MGP-STR."""
+
+import warnings
+
+from transformers import AutoTokenizer
+from transformers.utils import is_torch_available
+from transformers.utils.generic import ExplicitEnum
+
+from ...processing_utils import ProcessorMixin
+from ...utils.import_utils import requires
+
+
+if is_torch_available():
+    import torch
+
+
+class DecodeType(ExplicitEnum):
+    CHARACTER = "char"
+    BPE = "bpe"
+    WORDPIECE = "wp"
+
+
+SUPPORTED_ANNOTATION_FORMATS = (DecodeType.CHARACTER, DecodeType.BPE, DecodeType.WORDPIECE)
+
+
+@requires(backends=("sentencepiece",))
+class MgpstrProcessor(ProcessorMixin):
+    r"""
+    Constructs a MGP-STR processor which wraps an image processor and MGP-STR tokenizers into a single
+
+    [`MgpstrProcessor`] offers all the functionalities of `ViTImageProcessor`] and [`MgpstrTokenizer`]. See the
+    [`~MgpstrProcessor.__call__`] and [`~MgpstrProcessor.batch_decode`] for more information.
+
+    Args:
+        image_processor (`ViTImageProcessor`, *optional*):
+            An instance of `ViTImageProcessor`. The image processor is a required input.
+        tokenizer ([`MgpstrTokenizer`], *optional*):
+            The tokenizer is a required input.
+    """
+
+    attributes = ["image_processor", "char_tokenizer"]
+    image_processor_class = ("ViTImageProcessor", "ViTImageProcessorFast")
+    char_tokenizer_class = "MgpstrTokenizer"
+
+    def __init__(self, image_processor=None, tokenizer=None, **kwargs):
+        feature_extractor = None
+        if "feature_extractor" in kwargs:
+            warnings.warn(
+                "The `feature_extractor` argument is deprecated and will be removed in v5, use `image_processor`"
+                " instead.",
+                FutureWarning,
+            )
+            feature_extractor = kwargs.pop("feature_extractor")
+
+        image_processor = image_processor if image_processor is not None else feature_extractor
+
+        self.char_tokenizer = tokenizer
+        self.bpe_tokenizer = AutoTokenizer.from_pretrained("openai-community/gpt2")
+        self.wp_tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-uncased")
+
+        super().__init__(image_processor, tokenizer)
+
+    def __call__(self, text=None, images=None, return_tensors=None, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to ViTImageProcessor's
+        [`~ViTImageProcessor.__call__`] and returns its output. This method also forwards the `text` and `kwargs`
+        arguments to MgpstrTokenizer's [`~MgpstrTokenizer.__call__`] if `text` is not `None` to encode the text. Please
+        refer to the docstring of the above methods for more information.
+        """
+        if images is None and text is None:
+            raise ValueError("You need to specify either an `images` or `text` input to process.")
+
+        if images is not None:
+            inputs = self.image_processor(images, return_tensors=return_tensors, **kwargs)
+        if text is not None:
+            encodings = self.char_tokenizer(text, return_tensors=return_tensors, **kwargs)
+
+        if text is None:
+            return inputs
+        elif images is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def batch_decode(self, sequences):
+        """
+        Convert a list of lists of token ids into a list of strings by calling decode.
+
+        Args:
+            sequences (`torch.Tensor`):
+                List of tokenized input ids.
+
+        Returns:
+            `dict[str, any]`: Dictionary of all the outputs of the decoded results.
+                generated_text (`list[str]`): The final results after fusion of char, bpe, and wp. scores
+                (`list[float]`): The final scores after fusion of char, bpe, and wp. char_preds (`list[str]`): The list
+                of character decoded sentences. bpe_preds (`list[str]`): The list of bpe decoded sentences. wp_preds
+                (`list[str]`): The list of wp decoded sentences.
+
+        This method forwards all its arguments to PreTrainedTokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
+        refer to the docstring of this method for more information.
+        """
+        char_preds, bpe_preds, wp_preds = sequences
+        batch_size = char_preds.size(0)
+
+        char_strs, char_scores = self._decode_helper(char_preds, "char")
+        bpe_strs, bpe_scores = self._decode_helper(bpe_preds, "bpe")
+        wp_strs, wp_scores = self._decode_helper(wp_preds, "wp")
+
+        final_strs = []
+        final_scores = []
+        for i in range(batch_size):
+            scores = [char_scores[i], bpe_scores[i], wp_scores[i]]
+            strs = [char_strs[i], bpe_strs[i], wp_strs[i]]
+            max_score_index = scores.index(max(scores))
+            final_strs.append(strs[max_score_index])
+            final_scores.append(scores[max_score_index])
+
+        out = {}
+        out["generated_text"] = final_strs
+        out["scores"] = final_scores
+        out["char_preds"] = char_strs
+        out["bpe_preds"] = bpe_strs
+        out["wp_preds"] = wp_strs
+        return out
+
+    def _decode_helper(self, pred_logits, format):
+        """
+        Convert a list of lists of bpe token ids into a list of strings by calling bpe tokenizer.
+
+        Args:
+            pred_logits (`torch.Tensor`):
+                List of model prediction logits.
+            format (`Union[DecoderType, str]`):
+                Type of model prediction. Must be one of ['char', 'bpe', 'wp'].
+        Returns:
+            `tuple`:
+                dec_strs(`str`): The decode strings of model prediction. conf_scores(`list[float]`): The confidence
+                score of model prediction.
+        """
+        if format == DecodeType.CHARACTER:
+            decoder = self.char_decode
+            eos_token = 1
+            eos_str = "[s]"
+        elif format == DecodeType.BPE:
+            decoder = self.bpe_decode
+            eos_token = 2
+            eos_str = "#"
+        elif format == DecodeType.WORDPIECE:
+            decoder = self.wp_decode
+            eos_token = 102
+            eos_str = "[SEP]"
+        else:
+            raise ValueError(f"Format {format} is not supported.")
+
+        dec_strs, conf_scores = [], []
+        batch_size = pred_logits.size(0)
+        batch_max_length = pred_logits.size(1)
+        _, preds_index = pred_logits.topk(1, dim=-1, largest=True, sorted=True)
+        preds_index = preds_index.view(-1, batch_max_length)[:, 1:]
+        preds_str = decoder(preds_index)
+        preds_max_prob, _ = torch.nn.functional.softmax(pred_logits, dim=2).max(dim=2)
+        preds_max_prob = preds_max_prob[:, 1:]
+
+        for index in range(batch_size):
+            pred_eos = preds_str[index].find(eos_str)
+            pred = preds_str[index][:pred_eos]
+            pred_index = preds_index[index].tolist()
+            pred_eos_index = pred_index.index(eos_token) if eos_token in pred_index else -1
+            pred_max_prob = preds_max_prob[index][: pred_eos_index + 1]
+            confidence_score = pred_max_prob.cumprod(dim=0)[-1] if pred_max_prob.nelement() != 0 else 0.0
+            dec_strs.append(pred)
+            conf_scores.append(confidence_score)
+
+        return dec_strs, conf_scores
+
+    def char_decode(self, sequences):
+        """
+        Convert a list of lists of char token ids into a list of strings by calling char tokenizer.
+
+        Args:
+            sequences (`torch.Tensor`):
+                List of tokenized input ids.
+        Returns:
+            `list[str]`: The list of char decoded sentences.
+        """
+        decode_strs = [seq.replace(" ", "") for seq in self.char_tokenizer.batch_decode(sequences)]
+        return decode_strs
+
+    def bpe_decode(self, sequences):
+        """
+        Convert a list of lists of bpe token ids into a list of strings by calling bpe tokenizer.
+
+        Args:
+            sequences (`torch.Tensor`):
+                List of tokenized input ids.
+        Returns:
+            `list[str]`: The list of bpe decoded sentences.
+        """
+        return self.bpe_tokenizer.batch_decode(sequences)
+
+    def wp_decode(self, sequences):
+        """
+        Convert a list of lists of word piece token ids into a list of strings by calling word piece tokenizer.
+
+        Args:
+            sequences (`torch.Tensor`):
+                List of tokenized input ids.
+        Returns:
+            `list[str]`: The list of wp decoded sentences.
+        """
+        decode_strs = [seq.replace(" ", "") for seq in self.wp_tokenizer.batch_decode(sequences)]
+        return decode_strs
+
+
+__all__ = ["MgpstrProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/tokenization_mgp_str.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/tokenization_mgp_str.py
new file mode 100644
index 0000000000000000000000000000000000000000..f29b8a8348ee3771210df17de79be6c30a65c34a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mgp_str/tokenization_mgp_str.py
@@ -0,0 +1,104 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for MGT-STR CHAR."""
+
+import json
+import os
+from typing import Optional
+
+from ...tokenization_utils import PreTrainedTokenizer
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "vocab.json"}
+
+
+class MgpstrTokenizer(PreTrainedTokenizer):
+    """
+    Construct a MGP-STR char tokenizer.
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        unk_token (`str`, *optional*, defaults to `"[GO]"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        bos_token (`str`, *optional*, defaults to `"[GO]"`):
+            The beginning of sequence token.
+        eos_token (`str`, *optional*, defaults to `"[s]"`):
+            The end of sequence token.
+        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"[GO]"`):
+            A special token used to make arrays of tokens the same size for batching purpose. Will then be ignored by
+            attention mechanisms or loss computation.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+
+    def __init__(self, vocab_file, unk_token="[GO]", bos_token="[GO]", eos_token="[s]", pad_token="[GO]", **kwargs):
+        with open(vocab_file, encoding="utf-8") as vocab_handle:
+            self.vocab = json.load(vocab_handle)
+        self.decoder = {v: k for k, v in self.vocab.items()}
+        super().__init__(
+            unk_token=unk_token,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            pad_token=pad_token,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self):
+        return len(self.vocab)
+
+    def get_vocab(self):
+        vocab = dict(self.vocab).copy()
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def _tokenize(self, text):
+        """Tokenize a string."""
+        char_tokens = []
+        for s in text:
+            char_tokens.extend(s)
+        return char_tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        return self.vocab.get(token, self.vocab.get(self.unk_token))
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.decoder.get(index)
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        with open(vocab_file, "w", encoding="utf-8") as f:
+            f.write(json.dumps(self.vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n")
+
+        return (vocab_file,)
+
+
+__all__ = ["MgpstrTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17d7eb983f6d3a87babe0bf066fde67d0b476694
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/configuration_minimax.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/configuration_minimax.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e76ab3663b83972dd5895b878f15b759dc52f81b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/configuration_minimax.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modeling_minimax.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modeling_minimax.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..abb8cb25c3326a610596975e24b25257f84edab4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modeling_minimax.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modular_minimax.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modular_minimax.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a57b16b27ab3326d780475de25fd05f80b36b1da
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/minimax/__pycache__/modular_minimax.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..803d33bbfaa44b597d3848a95a3b674a413eed10
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/configuration_ministral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/configuration_ministral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1673c69c047e194d23257018f062d25b6d14407
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/configuration_ministral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modular_ministral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modular_ministral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df4be675d68f72fb78f061599e27276e82dc9ed3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/ministral/__pycache__/modular_ministral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdad2afa8c35c2a7d4de612ccbcd69b1658a93f6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37337d52ad7988b649c3abfade43d0d65c5c8d4b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/configuration_mobilebert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..28472af3f089996e67ae1caacd79f5848134dec9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_mobilebert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..65633a8aa1e152943b8c941bd29dcba878723a58
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/modeling_tf_mobilebert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..363f0cd34add34fd6ec00f3ec729b33bd19fb8c7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e09efd5674529048b9db54e5cff9b7e376468ab8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilebert/__pycache__/tokenization_mobilebert_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..bce83216c35546fe01cf96c738cbe9c2f3582486
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__init__.py
@@ -0,0 +1,30 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_mobilenet_v1 import *
+    from .feature_extraction_mobilenet_v1 import *
+    from .image_processing_mobilenet_v1 import *
+    from .image_processing_mobilenet_v1_fast import *
+    from .modeling_mobilenet_v1 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bd21d9a898ec65f6ba7b325cf7acd1933b54402a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/configuration_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/configuration_mobilenet_v1.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..506d147772dc443284f286a1aaa0cc47bd5de5d0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/configuration_mobilenet_v1.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/feature_extraction_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/feature_extraction_mobilenet_v1.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fabf58e7af316c0fb81ec1d754433407d442d412
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/feature_extraction_mobilenet_v1.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb13cfa5c4ed0ee89f526765ec01b31c53ff80b0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..94bd6c771f95de7cb74c136c76a550f58116d68c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/image_processing_mobilenet_v1_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/modeling_mobilenet_v1.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/modeling_mobilenet_v1.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e521352dddfcd5e4669a2af0e0ab1fb8f158f728
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/__pycache__/modeling_mobilenet_v1.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..c18d72d55003dc88e8fbcfe2ee80a18975d23dbc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/configuration_mobilenet_v1.py
@@ -0,0 +1,126 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MobileNetV1 model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MobileNetV1Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MobileNetV1Model`]. It is used to instantiate a
+    MobileNetV1 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the MobileNetV1
+    [google/mobilenet_v1_1.0_224](https://huggingface.co/google/mobilenet_v1_1.0_224) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        image_size (`int`, *optional*, defaults to 224):
+            The size (resolution) of each image.
+        depth_multiplier (`float`, *optional*, defaults to 1.0):
+            Shrinks or expands the number of channels in each layer. Default is 1.0, which starts the network with 32
+            channels. This is sometimes also called "alpha" or "width multiplier".
+        min_depth (`int`, *optional*, defaults to 8):
+            All layers will have at least this many channels.
+        hidden_act (`str` or `function`, *optional*, defaults to `"relu6"`):
+            The non-linear activation function (function or string) in the Transformer encoder and convolution layers.
+        tf_padding (`bool`, *optional*, defaults to `True`):
+            Whether to use TensorFlow padding rules on the convolution layers.
+        classifier_dropout_prob (`float`, *optional*, defaults to 0.999):
+            The dropout ratio for attached classifiers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 0.001):
+            The epsilon used by the layer normalization layers.
+
+    Example:
+
+    ```python
+    >>> from transformers import MobileNetV1Config, MobileNetV1Model
+
+    >>> # Initializing a "mobilenet_v1_1.0_224" style configuration
+    >>> configuration = MobileNetV1Config()
+
+    >>> # Initializing a model from the "mobilenet_v1_1.0_224" style configuration
+    >>> model = MobileNetV1Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mobilenet_v1"
+
+    def __init__(
+        self,
+        num_channels=3,
+        image_size=224,
+        depth_multiplier=1.0,
+        min_depth=8,
+        hidden_act="relu6",
+        tf_padding=True,
+        classifier_dropout_prob=0.999,
+        initializer_range=0.02,
+        layer_norm_eps=0.001,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        if depth_multiplier <= 0:
+            raise ValueError("depth_multiplier must be greater than zero.")
+
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.depth_multiplier = depth_multiplier
+        self.min_depth = min_depth
+        self.hidden_act = hidden_act
+        self.tf_padding = tf_padding
+        self.classifier_dropout_prob = classifier_dropout_prob
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+
+
+class MobileNetV1OnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict([("pixel_values", {0: "batch"})])
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "image-classification":
+            return OrderedDict([("logits", {0: "batch"})])
+        else:
+            return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})])
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+
+__all__ = ["MobileNetV1Config", "MobileNetV1OnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py
new file mode 100644
index 0000000000000000000000000000000000000000..25997a46790c009bc62e5b4323d378386e3c1718
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v1/modeling_mobilenet_v1.py
@@ -0,0 +1,414 @@
+# coding=utf-8
+# Copyright 2022 Apple Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch MobileNetV1 model."""
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BaseModelOutputWithPoolingAndNoAttention, ImageClassifierOutputWithNoAttention
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from .configuration_mobilenet_v1 import MobileNetV1Config
+
+
+logger = logging.get_logger(__name__)
+
+
+def _build_tf_to_pytorch_map(model, config, tf_weights=None):
+    """
+    A map of modules from TF to PyTorch.
+    """
+
+    tf_to_pt_map = {}
+
+    if isinstance(model, MobileNetV1ForImageClassification):
+        backbone = model.mobilenet_v1
+    else:
+        backbone = model
+
+    prefix = "MobilenetV1/Conv2d_0/"
+    tf_to_pt_map[prefix + "weights"] = backbone.conv_stem.convolution.weight
+    tf_to_pt_map[prefix + "BatchNorm/beta"] = backbone.conv_stem.normalization.bias
+    tf_to_pt_map[prefix + "BatchNorm/gamma"] = backbone.conv_stem.normalization.weight
+    tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = backbone.conv_stem.normalization.running_mean
+    tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = backbone.conv_stem.normalization.running_var
+
+    for i in range(13):
+        tf_index = i + 1
+        pt_index = i * 2
+
+        pointer = backbone.layer[pt_index]
+        prefix = f"MobilenetV1/Conv2d_{tf_index}_depthwise/"
+        tf_to_pt_map[prefix + "depthwise_weights"] = pointer.convolution.weight
+        tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias
+        tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight
+        tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean
+        tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var
+
+        pointer = backbone.layer[pt_index + 1]
+        prefix = f"MobilenetV1/Conv2d_{tf_index}_pointwise/"
+        tf_to_pt_map[prefix + "weights"] = pointer.convolution.weight
+        tf_to_pt_map[prefix + "BatchNorm/beta"] = pointer.normalization.bias
+        tf_to_pt_map[prefix + "BatchNorm/gamma"] = pointer.normalization.weight
+        tf_to_pt_map[prefix + "BatchNorm/moving_mean"] = pointer.normalization.running_mean
+        tf_to_pt_map[prefix + "BatchNorm/moving_variance"] = pointer.normalization.running_var
+
+    if isinstance(model, MobileNetV1ForImageClassification):
+        prefix = "MobilenetV1/Logits/Conv2d_1c_1x1/"
+        tf_to_pt_map[prefix + "weights"] = model.classifier.weight
+        tf_to_pt_map[prefix + "biases"] = model.classifier.bias
+
+    return tf_to_pt_map
+
+
+def load_tf_weights_in_mobilenet_v1(model, config, tf_checkpoint_path):
+    """Load TensorFlow checkpoints in a PyTorch model."""
+    try:
+        import numpy as np
+        import tensorflow as tf
+    except ImportError:
+        logger.error(
+            "Loading a TensorFlow models in PyTorch, requires TensorFlow to be installed. Please see "
+            "https://www.tensorflow.org/install/ for installation instructions."
+        )
+        raise
+
+    # Load weights from TF model
+    init_vars = tf.train.list_variables(tf_checkpoint_path)
+    tf_weights = {}
+    for name, shape in init_vars:
+        logger.info(f"Loading TF weight {name} with shape {shape}")
+        array = tf.train.load_variable(tf_checkpoint_path, name)
+        tf_weights[name] = array
+
+    # Build TF to PyTorch weights loading map
+    tf_to_pt_map = _build_tf_to_pytorch_map(model, config, tf_weights)
+
+    for name, pointer in tf_to_pt_map.items():
+        logger.info(f"Importing {name}")
+        if name not in tf_weights:
+            logger.info(f"{name} not in tf pre-trained weights, skipping")
+            continue
+
+        array = tf_weights[name]
+
+        if "depthwise_weights" in name:
+            logger.info("Transposing depthwise")
+            array = np.transpose(array, (2, 3, 0, 1))
+        elif "weights" in name:
+            logger.info("Transposing")
+            if len(pointer.shape) == 2:  # copying into linear layer
+                array = array.squeeze().transpose()
+            else:
+                array = np.transpose(array, (3, 2, 0, 1))
+
+        if pointer.shape != array.shape:
+            raise ValueError(f"Pointer shape {pointer.shape} and array shape {array.shape} mismatched")
+
+        logger.info(f"Initialize PyTorch weight {name} {array.shape}")
+        pointer.data = torch.from_numpy(array)
+
+        tf_weights.pop(name, None)
+        tf_weights.pop(name + "/RMSProp", None)
+        tf_weights.pop(name + "/RMSProp_1", None)
+        tf_weights.pop(name + "/ExponentialMovingAverage", None)
+
+    logger.info(f"Weights not copied to PyTorch model: {', '.join(tf_weights.keys())}")
+    return model
+
+
+def apply_tf_padding(features: torch.Tensor, conv_layer: nn.Conv2d) -> torch.Tensor:
+    """
+    Apply TensorFlow-style "SAME" padding to a convolution layer. See the notes at:
+    https://www.tensorflow.org/api_docs/python/tf/nn#notes_on_padding_2
+    """
+    in_height, in_width = features.shape[-2:]
+    stride_height, stride_width = conv_layer.stride
+    kernel_height, kernel_width = conv_layer.kernel_size
+
+    if in_height % stride_height == 0:
+        pad_along_height = max(kernel_height - stride_height, 0)
+    else:
+        pad_along_height = max(kernel_height - (in_height % stride_height), 0)
+
+    if in_width % stride_width == 0:
+        pad_along_width = max(kernel_width - stride_width, 0)
+    else:
+        pad_along_width = max(kernel_width - (in_width % stride_width), 0)
+
+    pad_left = pad_along_width // 2
+    pad_right = pad_along_width - pad_left
+    pad_top = pad_along_height // 2
+    pad_bottom = pad_along_height - pad_top
+
+    padding = (pad_left, pad_right, pad_top, pad_bottom)
+    return nn.functional.pad(features, padding, "constant", 0.0)
+
+
+class MobileNetV1ConvLayer(nn.Module):
+    def __init__(
+        self,
+        config: MobileNetV1Config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: Optional[int] = 1,
+        groups: Optional[int] = 1,
+        bias: bool = False,
+        use_normalization: Optional[bool] = True,
+        use_activation: Optional[Union[bool, str]] = True,
+    ) -> None:
+        super().__init__()
+        self.config = config
+
+        if in_channels % groups != 0:
+            raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
+        if out_channels % groups != 0:
+            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")
+
+        padding = 0 if config.tf_padding else int((kernel_size - 1) / 2)
+
+        self.convolution = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            groups=groups,
+            bias=bias,
+            padding_mode="zeros",
+        )
+
+        if use_normalization:
+            self.normalization = nn.BatchNorm2d(
+                num_features=out_channels,
+                eps=config.layer_norm_eps,
+                momentum=0.9997,
+                affine=True,
+                track_running_stats=True,
+            )
+        else:
+            self.normalization = None
+
+        if use_activation:
+            if isinstance(use_activation, str):
+                self.activation = ACT2FN[use_activation]
+            elif isinstance(config.hidden_act, str):
+                self.activation = ACT2FN[config.hidden_act]
+            else:
+                self.activation = config.hidden_act
+        else:
+            self.activation = None
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        if self.config.tf_padding:
+            features = apply_tf_padding(features, self.convolution)
+        features = self.convolution(features)
+        if self.normalization is not None:
+            features = self.normalization(features)
+        if self.activation is not None:
+            features = self.activation(features)
+        return features
+
+
+@auto_docstring
+class MobileNetV1PreTrainedModel(PreTrainedModel):
+    config: MobileNetV1Config
+    load_tf_weights = load_tf_weights_in_mobilenet_v1
+    base_model_prefix = "mobilenet_v1"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = False
+    _no_split_modules = []
+
+    def _init_weights(self, module: Union[nn.Linear, nn.Conv2d]) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.BatchNorm2d):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class MobileNetV1Model(MobileNetV1PreTrainedModel):
+    def __init__(self, config: MobileNetV1Config, add_pooling_layer: bool = True):
+        r"""
+        add_pooling_layer (bool, *optional*, defaults to `True`):
+            Whether to add a pooling layer
+        """
+        super().__init__(config)
+        self.config = config
+
+        depth = 32
+        out_channels = max(int(depth * config.depth_multiplier), config.min_depth)
+
+        self.conv_stem = MobileNetV1ConvLayer(
+            config,
+            in_channels=config.num_channels,
+            out_channels=out_channels,
+            kernel_size=3,
+            stride=2,
+        )
+
+        strides = [1, 2, 1, 2, 1, 2, 1, 1, 1, 1, 1, 2, 1]
+
+        self.layer = nn.ModuleList()
+        for i in range(13):
+            in_channels = out_channels
+
+            if strides[i] == 2 or i == 0:
+                depth *= 2
+                out_channels = max(int(depth * config.depth_multiplier), config.min_depth)
+
+            self.layer.append(
+                MobileNetV1ConvLayer(
+                    config,
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    kernel_size=3,
+                    stride=strides[i],
+                    groups=in_channels,
+                )
+            )
+
+            self.layer.append(
+                MobileNetV1ConvLayer(
+                    config,
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=1,
+                )
+            )
+
+        self.pooler = nn.AdaptiveAvgPool2d((1, 1)) if add_pooling_layer else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _prune_heads(self, heads_to_prune):
+        raise NotImplementedError
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        hidden_states = self.conv_stem(pixel_values)
+
+        all_hidden_states = () if output_hidden_states else None
+
+        for i, layer_module in enumerate(self.layer):
+            hidden_states = layer_module(hidden_states)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        last_hidden_state = hidden_states
+
+        if self.pooler is not None:
+            pooled_output = torch.flatten(self.pooler(last_hidden_state), start_dim=1)
+        else:
+            pooled_output = None
+
+        if not return_dict:
+            return tuple(v for v in [last_hidden_state, pooled_output, all_hidden_states] if v is not None)
+
+        return BaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=all_hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    MobileNetV1 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """
+)
+class MobileNetV1ForImageClassification(MobileNetV1PreTrainedModel):
+    def __init__(self, config: MobileNetV1Config) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.mobilenet_v1 = MobileNetV1Model(config)
+
+        last_hidden_size = self.mobilenet_v1.layer[-1].convolution.out_channels
+
+        # Classifier head
+        self.dropout = nn.Dropout(config.classifier_dropout_prob, inplace=True)
+        self.classifier = nn.Linear(last_hidden_size, config.num_labels) if config.num_labels > 0 else nn.Identity()
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.mobilenet_v1(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+        logits = self.classifier(self.dropout(pooled_output))
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
+
+
+__all__ = [
+    "MobileNetV1ForImageClassification",
+    "MobileNetV1Model",
+    "MobileNetV1PreTrainedModel",
+    "load_tf_weights_in_mobilenet_v1",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..36a4229b4a0068a3293e4e11b12fd31551567623
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/configuration_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/configuration_mobilenet_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e1c6109fd2023d8dcaeb013d67c01b6ec3ba9c69
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/configuration_mobilenet_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/feature_extraction_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/feature_extraction_mobilenet_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..873e73b7f99f2b2bbd69da7d7312b98f70ba7ee6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/feature_extraction_mobilenet_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16c6309c3e759170330184ce1fcc440ff20a4dcd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..714e6ad0ed29c3b837fcb011d2c393dc489bcae1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/image_processing_mobilenet_v2_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/modeling_mobilenet_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/modeling_mobilenet_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66edc800a400e438ebcd5551d5aaa4a26dcbf6cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/__pycache__/modeling_mobilenet_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/feature_extraction_mobilenet_v2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/feature_extraction_mobilenet_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..e36b50cffaada3293d1dada34f06022f3ecf8163
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilenet_v2/feature_extraction_mobilenet_v2.py
@@ -0,0 +1,38 @@
+# coding=utf-8
+# Copyright 2022 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Feature extractor class for MobileNetV2."""
+
+import warnings
+
+from ...utils import logging
+from ...utils.import_utils import requires
+from .image_processing_mobilenet_v2 import MobileNetV2ImageProcessor
+
+
+logger = logging.get_logger(__name__)
+
+
+@requires(backends=("vision",))
+class MobileNetV2FeatureExtractor(MobileNetV2ImageProcessor):
+    def __init__(self, *args, **kwargs) -> None:
+        warnings.warn(
+            "The class MobileNetV2FeatureExtractor is deprecated and will be removed in version 5 of Transformers."
+            " Please use MobileNetV2ImageProcessor instead.",
+            FutureWarning,
+        )
+        super().__init__(*args, **kwargs)
+
+
+__all__ = ["MobileNetV2FeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..15e10a1d9815935629fadf9405effdae5a16fbb2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_mobilevitv2 import *
+    from .modeling_mobilevitv2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7a1bbb4fb72d73b2c5bc4a4305b24a9e1346cfb8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/configuration_mobilevitv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/configuration_mobilevitv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..029cd2f974327a28a2bc390d8f527d286bfc6a35
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/configuration_mobilevitv2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/modeling_mobilevitv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/modeling_mobilevitv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26870c71541e38ca83037e212be00433d5c4481c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/__pycache__/modeling_mobilevitv2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/configuration_mobilevitv2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/configuration_mobilevitv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc9db247602c77daa35c3c3ad97dbda10e881cc8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/configuration_mobilevitv2.py
@@ -0,0 +1,168 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""MobileViTV2 model configuration"""
+
+from collections import OrderedDict
+from collections.abc import Mapping
+
+from packaging import version
+
+from ...configuration_utils import PretrainedConfig
+from ...onnx import OnnxConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class MobileViTV2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`MobileViTV2Model`]. It is used to instantiate a
+    MobileViTV2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the MobileViTV2
+    [apple/mobilevitv2-1.0](https://huggingface.co/apple/mobilevitv2-1.0) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        image_size (`int`, *optional*, defaults to 256):
+            The size (resolution) of each image.
+        patch_size (`int`, *optional*, defaults to 2):
+            The size (resolution) of each patch.
+        expand_ratio (`float`, *optional*, defaults to 2.0):
+            Expansion factor for the MobileNetv2 layers.
+        hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
+            The non-linear activation function (function or string) in the Transformer encoder and convolution layers.
+        conv_kernel_size (`int`, *optional*, defaults to 3):
+            The size of the convolutional kernel in the MobileViTV2 layer.
+        output_stride (`int`, *optional*, defaults to 32):
+            The ratio of the spatial resolution of the output to the resolution of the input image.
+        classifier_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for attached classifiers.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        aspp_out_channels (`int`, *optional*, defaults to 512):
+            Number of output channels used in the ASPP layer for semantic segmentation.
+        atrous_rates (`list[int]`, *optional*, defaults to `[6, 12, 18]`):
+            Dilation (atrous) factors used in the ASPP layer for semantic segmentation.
+        aspp_dropout_prob (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the ASPP layer for semantic segmentation.
+        semantic_loss_ignore_index (`int`, *optional*, defaults to 255):
+            The index that is ignored by the loss function of the semantic segmentation model.
+        n_attn_blocks (`list[int]`, *optional*, defaults to `[2, 4, 3]`):
+            The number of attention blocks in each MobileViTV2Layer
+        base_attn_unit_dims (`list[int]`, *optional*, defaults to `[128, 192, 256]`):
+            The base multiplier for dimensions of attention blocks in each MobileViTV2Layer
+        width_multiplier (`float`, *optional*, defaults to 1.0):
+            The width multiplier for MobileViTV2.
+        ffn_multiplier (`int`, *optional*, defaults to 2):
+            The FFN multiplier for MobileViTV2.
+        attn_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout in the attention layer.
+        ffn_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout between FFN layers.
+
+    Example:
+
+    ```python
+    >>> from transformers import MobileViTV2Config, MobileViTV2Model
+
+    >>> # Initializing a mobilevitv2-small style configuration
+    >>> configuration = MobileViTV2Config()
+
+    >>> # Initializing a model from the mobilevitv2-small style configuration
+    >>> model = MobileViTV2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "mobilevitv2"
+
+    def __init__(
+        self,
+        num_channels=3,
+        image_size=256,
+        patch_size=2,
+        expand_ratio=2.0,
+        hidden_act="swish",
+        conv_kernel_size=3,
+        output_stride=32,
+        classifier_dropout_prob=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        aspp_out_channels=512,
+        atrous_rates=[6, 12, 18],
+        aspp_dropout_prob=0.1,
+        semantic_loss_ignore_index=255,
+        n_attn_blocks=[2, 4, 3],
+        base_attn_unit_dims=[128, 192, 256],
+        width_multiplier=1.0,
+        ffn_multiplier=2,
+        attn_dropout=0.0,
+        ffn_dropout=0.0,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        self.num_channels = num_channels
+        self.image_size = image_size
+        self.patch_size = patch_size
+        self.expand_ratio = expand_ratio
+        self.hidden_act = hidden_act
+        self.conv_kernel_size = conv_kernel_size
+        self.output_stride = output_stride
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.n_attn_blocks = n_attn_blocks
+        self.base_attn_unit_dims = base_attn_unit_dims
+        self.width_multiplier = width_multiplier
+        self.ffn_multiplier = ffn_multiplier
+        self.ffn_dropout = ffn_dropout
+        self.attn_dropout = attn_dropout
+        self.classifier_dropout_prob = classifier_dropout_prob
+
+        # decode head attributes for semantic segmentation
+        self.aspp_out_channels = aspp_out_channels
+        self.atrous_rates = atrous_rates
+        self.aspp_dropout_prob = aspp_dropout_prob
+        self.semantic_loss_ignore_index = semantic_loss_ignore_index
+
+
+class MobileViTV2OnnxConfig(OnnxConfig):
+    torch_onnx_minimum_version = version.parse("1.11")
+
+    @property
+    def inputs(self) -> Mapping[str, Mapping[int, str]]:
+        return OrderedDict([("pixel_values", {0: "batch", 1: "num_channels", 2: "height", 3: "width"})])
+
+    @property
+    def outputs(self) -> Mapping[str, Mapping[int, str]]:
+        if self.task == "image-classification":
+            return OrderedDict([("logits", {0: "batch"})])
+        else:
+            return OrderedDict([("last_hidden_state", {0: "batch"}), ("pooler_output", {0: "batch"})])
+
+    @property
+    def atol_for_validation(self) -> float:
+        return 1e-4
+
+
+__all__ = ["MobileViTV2Config", "MobileViTV2OnnxConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.py
new file mode 100644
index 0000000000000000000000000000000000000000..4e0e972a648ad028f37d4a5928e3c4f178fa9b27
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/mobilevitv2/modeling_mobilevitv2.py
@@ -0,0 +1,947 @@
+# coding=utf-8
+# Copyright 2023 Apple Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Original license: https://github.com/apple/ml-cvnets/blob/main/LICENSE
+"""PyTorch MobileViTV2 model."""
+
+from typing import Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutputWithNoAttention,
+    BaseModelOutputWithPoolingAndNoAttention,
+    ImageClassifierOutputWithNoAttention,
+    SemanticSegmenterOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from .configuration_mobilevitv2 import MobileViTV2Config
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.mobilevit.modeling_mobilevit.make_divisible
+def make_divisible(value: int, divisor: int = 8, min_value: Optional[int] = None) -> int:
+    """
+    Ensure that all layers have a channel count that is divisible by `divisor`. This function is taken from the
+    original TensorFlow repo. It can be seen here:
+    https://github.com/tensorflow/models/blob/master/research/slim/nets/mobilenet/mobilenet.py
+    """
+    if min_value is None:
+        min_value = divisor
+    new_value = max(min_value, int(value + divisor / 2) // divisor * divisor)
+    # Make sure that round down does not go down by more than 10%.
+    if new_value < 0.9 * value:
+        new_value += divisor
+    return int(new_value)
+
+
+def clip(value: float, min_val: float = float("-inf"), max_val: float = float("inf")) -> float:
+    return max(min_val, min(max_val, value))
+
+
+# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTConvLayer with MobileViT->MobileViTV2
+class MobileViTV2ConvLayer(nn.Module):
+    def __init__(
+        self,
+        config: MobileViTV2Config,
+        in_channels: int,
+        out_channels: int,
+        kernel_size: int,
+        stride: int = 1,
+        groups: int = 1,
+        bias: bool = False,
+        dilation: int = 1,
+        use_normalization: bool = True,
+        use_activation: Union[bool, str] = True,
+    ) -> None:
+        super().__init__()
+        padding = int((kernel_size - 1) / 2) * dilation
+
+        if in_channels % groups != 0:
+            raise ValueError(f"Input channels ({in_channels}) are not divisible by {groups} groups.")
+        if out_channels % groups != 0:
+            raise ValueError(f"Output channels ({out_channels}) are not divisible by {groups} groups.")
+
+        self.convolution = nn.Conv2d(
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=kernel_size,
+            stride=stride,
+            padding=padding,
+            dilation=dilation,
+            groups=groups,
+            bias=bias,
+            padding_mode="zeros",
+        )
+
+        if use_normalization:
+            self.normalization = nn.BatchNorm2d(
+                num_features=out_channels,
+                eps=1e-5,
+                momentum=0.1,
+                affine=True,
+                track_running_stats=True,
+            )
+        else:
+            self.normalization = None
+
+        if use_activation:
+            if isinstance(use_activation, str):
+                self.activation = ACT2FN[use_activation]
+            elif isinstance(config.hidden_act, str):
+                self.activation = ACT2FN[config.hidden_act]
+            else:
+                self.activation = config.hidden_act
+        else:
+            self.activation = None
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        features = self.convolution(features)
+        if self.normalization is not None:
+            features = self.normalization(features)
+        if self.activation is not None:
+            features = self.activation(features)
+        return features
+
+
+# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTInvertedResidual with MobileViT->MobileViTV2
+class MobileViTV2InvertedResidual(nn.Module):
+    """
+    Inverted residual block (MobileNetv2): https://huggingface.co/papers/1801.04381
+    """
+
+    def __init__(
+        self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int, dilation: int = 1
+    ) -> None:
+        super().__init__()
+        expanded_channels = make_divisible(int(round(in_channels * config.expand_ratio)), 8)
+
+        if stride not in [1, 2]:
+            raise ValueError(f"Invalid stride {stride}.")
+
+        self.use_residual = (stride == 1) and (in_channels == out_channels)
+
+        self.expand_1x1 = MobileViTV2ConvLayer(
+            config, in_channels=in_channels, out_channels=expanded_channels, kernel_size=1
+        )
+
+        self.conv_3x3 = MobileViTV2ConvLayer(
+            config,
+            in_channels=expanded_channels,
+            out_channels=expanded_channels,
+            kernel_size=3,
+            stride=stride,
+            groups=expanded_channels,
+            dilation=dilation,
+        )
+
+        self.reduce_1x1 = MobileViTV2ConvLayer(
+            config,
+            in_channels=expanded_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            use_activation=False,
+        )
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        residual = features
+
+        features = self.expand_1x1(features)
+        features = self.conv_3x3(features)
+        features = self.reduce_1x1(features)
+
+        return residual + features if self.use_residual else features
+
+
+# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTMobileNetLayer with MobileViT->MobileViTV2
+class MobileViTV2MobileNetLayer(nn.Module):
+    def __init__(
+        self, config: MobileViTV2Config, in_channels: int, out_channels: int, stride: int = 1, num_stages: int = 1
+    ) -> None:
+        super().__init__()
+
+        self.layer = nn.ModuleList()
+        for i in range(num_stages):
+            layer = MobileViTV2InvertedResidual(
+                config,
+                in_channels=in_channels,
+                out_channels=out_channels,
+                stride=stride if i == 0 else 1,
+            )
+            self.layer.append(layer)
+            in_channels = out_channels
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        for layer_module in self.layer:
+            features = layer_module(features)
+        return features
+
+
+class MobileViTV2LinearSelfAttention(nn.Module):
+    """
+    This layer applies a self-attention with linear complexity, as described in MobileViTV2 paper:
+    https://huggingface.co/papers/2206.02680
+
+    Args:
+        config (`MobileVitv2Config`):
+             Model configuration object
+        embed_dim (`int`):
+            `input_channels` from an expected input of size :math:`(batch_size, input_channels, height, width)`
+    """
+
+    def __init__(self, config: MobileViTV2Config, embed_dim: int) -> None:
+        super().__init__()
+
+        self.qkv_proj = MobileViTV2ConvLayer(
+            config=config,
+            in_channels=embed_dim,
+            out_channels=1 + (2 * embed_dim),
+            bias=True,
+            kernel_size=1,
+            use_normalization=False,
+            use_activation=False,
+        )
+
+        self.attn_dropout = nn.Dropout(p=config.attn_dropout)
+        self.out_proj = MobileViTV2ConvLayer(
+            config=config,
+            in_channels=embed_dim,
+            out_channels=embed_dim,
+            bias=True,
+            kernel_size=1,
+            use_normalization=False,
+            use_activation=False,
+        )
+        self.embed_dim = embed_dim
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        # (batch_size, embed_dim, num_pixels_in_patch, num_patches) --> (batch_size, 1+2*embed_dim, num_pixels_in_patch, num_patches)
+        qkv = self.qkv_proj(hidden_states)
+
+        # Project hidden_states into query, key and value
+        # Query --> [batch_size, 1, num_pixels_in_patch, num_patches]
+        # value, key --> [batch_size, embed_dim, num_pixels_in_patch, num_patches]
+        query, key, value = torch.split(qkv, split_size_or_sections=[1, self.embed_dim, self.embed_dim], dim=1)
+
+        # apply softmax along num_patches dimension
+        context_scores = torch.nn.functional.softmax(query, dim=-1)
+        context_scores = self.attn_dropout(context_scores)
+
+        # Compute context vector
+        # [batch_size, embed_dim, num_pixels_in_patch, num_patches] x [batch_size, 1, num_pixels_in_patch, num_patches] -> [batch_size, embed_dim, num_pixels_in_patch, num_patches]
+        context_vector = key * context_scores
+        # [batch_size, embed_dim, num_pixels_in_patch, num_patches] --> [batch_size, embed_dim, num_pixels_in_patch, 1]
+        context_vector = torch.sum(context_vector, dim=-1, keepdim=True)
+
+        # combine context vector with values
+        # [batch_size, embed_dim, num_pixels_in_patch, num_patches] * [batch_size, embed_dim, num_pixels_in_patch, 1] --> [batch_size, embed_dim, num_pixels_in_patch, num_patches]
+        out = torch.nn.functional.relu(value) * context_vector.expand_as(value)
+        out = self.out_proj(out)
+        return out
+
+
+class MobileViTV2FFN(nn.Module):
+    def __init__(
+        self,
+        config: MobileViTV2Config,
+        embed_dim: int,
+        ffn_latent_dim: int,
+        ffn_dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.conv1 = MobileViTV2ConvLayer(
+            config=config,
+            in_channels=embed_dim,
+            out_channels=ffn_latent_dim,
+            kernel_size=1,
+            stride=1,
+            bias=True,
+            use_normalization=False,
+            use_activation=True,
+        )
+        self.dropout1 = nn.Dropout(ffn_dropout)
+
+        self.conv2 = MobileViTV2ConvLayer(
+            config=config,
+            in_channels=ffn_latent_dim,
+            out_channels=embed_dim,
+            kernel_size=1,
+            stride=1,
+            bias=True,
+            use_normalization=False,
+            use_activation=False,
+        )
+        self.dropout2 = nn.Dropout(ffn_dropout)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        hidden_states = self.conv1(hidden_states)
+        hidden_states = self.dropout1(hidden_states)
+        hidden_states = self.conv2(hidden_states)
+        hidden_states = self.dropout2(hidden_states)
+        return hidden_states
+
+
+class MobileViTV2TransformerLayer(nn.Module):
+    def __init__(
+        self,
+        config: MobileViTV2Config,
+        embed_dim: int,
+        ffn_latent_dim: int,
+        dropout: float = 0.0,
+    ) -> None:
+        super().__init__()
+        self.layernorm_before = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps)
+        self.attention = MobileViTV2LinearSelfAttention(config, embed_dim)
+        self.dropout1 = nn.Dropout(p=dropout)
+        self.layernorm_after = nn.GroupNorm(num_groups=1, num_channels=embed_dim, eps=config.layer_norm_eps)
+        self.ffn = MobileViTV2FFN(config, embed_dim, ffn_latent_dim, config.ffn_dropout)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        layernorm_1_out = self.layernorm_before(hidden_states)
+        attention_output = self.attention(layernorm_1_out)
+        hidden_states = attention_output + hidden_states
+
+        layer_output = self.layernorm_after(hidden_states)
+        layer_output = self.ffn(layer_output)
+
+        layer_output = layer_output + hidden_states
+        return layer_output
+
+
+class MobileViTV2Transformer(nn.Module):
+    def __init__(self, config: MobileViTV2Config, n_layers: int, d_model: int) -> None:
+        super().__init__()
+
+        ffn_multiplier = config.ffn_multiplier
+
+        ffn_dims = [ffn_multiplier * d_model] * n_layers
+
+        # ensure that dims are multiple of 16
+        ffn_dims = [int((d // 16) * 16) for d in ffn_dims]
+
+        self.layer = nn.ModuleList()
+        for block_idx in range(n_layers):
+            transformer_layer = MobileViTV2TransformerLayer(
+                config, embed_dim=d_model, ffn_latent_dim=ffn_dims[block_idx]
+            )
+            self.layer.append(transformer_layer)
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        for layer_module in self.layer:
+            hidden_states = layer_module(hidden_states)
+        return hidden_states
+
+
+class MobileViTV2Layer(GradientCheckpointingLayer):
+    """
+    MobileViTV2 layer: https://huggingface.co/papers/2206.02680
+    """
+
+    def __init__(
+        self,
+        config: MobileViTV2Config,
+        in_channels: int,
+        out_channels: int,
+        attn_unit_dim: int,
+        n_attn_blocks: int = 2,
+        dilation: int = 1,
+        stride: int = 2,
+    ) -> None:
+        super().__init__()
+        self.patch_width = config.patch_size
+        self.patch_height = config.patch_size
+
+        cnn_out_dim = attn_unit_dim
+
+        if stride == 2:
+            self.downsampling_layer = MobileViTV2InvertedResidual(
+                config,
+                in_channels=in_channels,
+                out_channels=out_channels,
+                stride=stride if dilation == 1 else 1,
+                dilation=dilation // 2 if dilation > 1 else 1,
+            )
+            in_channels = out_channels
+        else:
+            self.downsampling_layer = None
+
+        # Local representations
+        self.conv_kxk = MobileViTV2ConvLayer(
+            config,
+            in_channels=in_channels,
+            out_channels=in_channels,
+            kernel_size=config.conv_kernel_size,
+            groups=in_channels,
+        )
+        self.conv_1x1 = MobileViTV2ConvLayer(
+            config,
+            in_channels=in_channels,
+            out_channels=cnn_out_dim,
+            kernel_size=1,
+            use_normalization=False,
+            use_activation=False,
+        )
+
+        # Global representations
+        self.transformer = MobileViTV2Transformer(config, d_model=attn_unit_dim, n_layers=n_attn_blocks)
+
+        # self.layernorm = MobileViTV2LayerNorm2D(attn_unit_dim, eps=config.layer_norm_eps)
+        self.layernorm = nn.GroupNorm(num_groups=1, num_channels=attn_unit_dim, eps=config.layer_norm_eps)
+
+        # Fusion
+        self.conv_projection = MobileViTV2ConvLayer(
+            config,
+            in_channels=cnn_out_dim,
+            out_channels=in_channels,
+            kernel_size=1,
+            use_normalization=True,
+            use_activation=False,
+        )
+
+    def unfolding(self, feature_map: torch.Tensor) -> tuple[torch.Tensor, tuple[int, int]]:
+        batch_size, in_channels, img_height, img_width = feature_map.shape
+        patches = nn.functional.unfold(
+            feature_map,
+            kernel_size=(self.patch_height, self.patch_width),
+            stride=(self.patch_height, self.patch_width),
+        )
+        patches = patches.reshape(batch_size, in_channels, self.patch_height * self.patch_width, -1)
+
+        return patches, (img_height, img_width)
+
+    def folding(self, patches: torch.Tensor, output_size: tuple[int, int]) -> torch.Tensor:
+        batch_size, in_dim, patch_size, n_patches = patches.shape
+        patches = patches.reshape(batch_size, in_dim * patch_size, n_patches)
+
+        feature_map = nn.functional.fold(
+            patches,
+            output_size=output_size,
+            kernel_size=(self.patch_height, self.patch_width),
+            stride=(self.patch_height, self.patch_width),
+        )
+
+        return feature_map
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        # reduce spatial dimensions if needed
+        if self.downsampling_layer:
+            features = self.downsampling_layer(features)
+
+        # local representation
+        features = self.conv_kxk(features)
+        features = self.conv_1x1(features)
+
+        # convert feature map to patches
+        patches, output_size = self.unfolding(features)
+
+        # learn global representations
+        patches = self.transformer(patches)
+        patches = self.layernorm(patches)
+
+        # convert patches back to feature maps
+        # [batch_size, patch_height, patch_width, input_dim] --> [batch_size, input_dim, patch_height, patch_width]
+        features = self.folding(patches, output_size)
+
+        features = self.conv_projection(features)
+        return features
+
+
+class MobileViTV2Encoder(nn.Module):
+    def __init__(self, config: MobileViTV2Config) -> None:
+        super().__init__()
+        self.config = config
+
+        self.layer = nn.ModuleList()
+        self.gradient_checkpointing = False
+
+        # segmentation architectures like DeepLab and PSPNet modify the strides
+        # of the classification backbones
+        dilate_layer_4 = dilate_layer_5 = False
+        if config.output_stride == 8:
+            dilate_layer_4 = True
+            dilate_layer_5 = True
+        elif config.output_stride == 16:
+            dilate_layer_5 = True
+
+        dilation = 1
+
+        layer_0_dim = make_divisible(
+            clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16
+        )
+
+        layer_1_dim = make_divisible(64 * config.width_multiplier, divisor=16)
+        layer_2_dim = make_divisible(128 * config.width_multiplier, divisor=8)
+        layer_3_dim = make_divisible(256 * config.width_multiplier, divisor=8)
+        layer_4_dim = make_divisible(384 * config.width_multiplier, divisor=8)
+        layer_5_dim = make_divisible(512 * config.width_multiplier, divisor=8)
+
+        layer_1 = MobileViTV2MobileNetLayer(
+            config,
+            in_channels=layer_0_dim,
+            out_channels=layer_1_dim,
+            stride=1,
+            num_stages=1,
+        )
+        self.layer.append(layer_1)
+
+        layer_2 = MobileViTV2MobileNetLayer(
+            config,
+            in_channels=layer_1_dim,
+            out_channels=layer_2_dim,
+            stride=2,
+            num_stages=2,
+        )
+        self.layer.append(layer_2)
+
+        layer_3 = MobileViTV2Layer(
+            config,
+            in_channels=layer_2_dim,
+            out_channels=layer_3_dim,
+            attn_unit_dim=make_divisible(config.base_attn_unit_dims[0] * config.width_multiplier, divisor=8),
+            n_attn_blocks=config.n_attn_blocks[0],
+        )
+        self.layer.append(layer_3)
+
+        if dilate_layer_4:
+            dilation *= 2
+
+        layer_4 = MobileViTV2Layer(
+            config,
+            in_channels=layer_3_dim,
+            out_channels=layer_4_dim,
+            attn_unit_dim=make_divisible(config.base_attn_unit_dims[1] * config.width_multiplier, divisor=8),
+            n_attn_blocks=config.n_attn_blocks[1],
+            dilation=dilation,
+        )
+        self.layer.append(layer_4)
+
+        if dilate_layer_5:
+            dilation *= 2
+
+        layer_5 = MobileViTV2Layer(
+            config,
+            in_channels=layer_4_dim,
+            out_channels=layer_5_dim,
+            attn_unit_dim=make_divisible(config.base_attn_unit_dims[2] * config.width_multiplier, divisor=8),
+            n_attn_blocks=config.n_attn_blocks[2],
+            dilation=dilation,
+        )
+        self.layer.append(layer_5)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ) -> Union[tuple, BaseModelOutputWithNoAttention]:
+        all_hidden_states = () if output_hidden_states else None
+
+        for i, layer_module in enumerate(self.layer):
+            hidden_states = layer_module(hidden_states)
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states] if v is not None)
+
+        return BaseModelOutputWithNoAttention(last_hidden_state=hidden_states, hidden_states=all_hidden_states)
+
+
+@auto_docstring
+class MobileViTV2PreTrainedModel(PreTrainedModel):
+    config: MobileViTV2Config
+    base_model_prefix = "mobilevitv2"
+    main_input_name = "pixel_values"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["MobileViTV2Layer"]
+
+    def _init_weights(self, module: nn.Module) -> None:
+        """Initialize the weights"""
+        if isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            # Slightly different from the TF version which uses truncated_normal for initialization
+            # cf https://github.com/pytorch/pytorch/pull/5617
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.GroupNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+
+
+@auto_docstring
+class MobileViTV2Model(MobileViTV2PreTrainedModel):
+    def __init__(self, config: MobileViTV2Config, expand_output: bool = True):
+        r"""
+        expand_output (`bool`, *optional*, defaults to `True`):
+            Whether to expand the output of the model. If `True`, the model will output pooled features in addition to
+            hidden states. If `False`, only the hidden states will be returned.
+        """
+        super().__init__(config)
+        self.config = config
+        self.expand_output = expand_output
+
+        layer_0_dim = make_divisible(
+            clip(value=32 * config.width_multiplier, min_val=16, max_val=64), divisor=8, min_value=16
+        )
+
+        self.conv_stem = MobileViTV2ConvLayer(
+            config,
+            in_channels=config.num_channels,
+            out_channels=layer_0_dim,
+            kernel_size=3,
+            stride=2,
+            use_normalization=True,
+            use_activation=True,
+        )
+        self.encoder = MobileViTV2Encoder(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _prune_heads(self, heads_to_prune):
+        """Prunes heads of the model.
+        heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base class PreTrainedModel
+        """
+        for layer_index, heads in heads_to_prune.items():
+            mobilevitv2_layer = self.encoder.layer[layer_index]
+            if isinstance(mobilevitv2_layer, MobileViTV2Layer):
+                for transformer_layer in mobilevitv2_layer.transformer.layer:
+                    transformer_layer.attention.prune_heads(heads)
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, BaseModelOutputWithPoolingAndNoAttention]:
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if pixel_values is None:
+            raise ValueError("You have to specify pixel_values")
+
+        embedding_output = self.conv_stem(pixel_values)
+
+        encoder_outputs = self.encoder(
+            embedding_output,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.expand_output:
+            last_hidden_state = encoder_outputs[0]
+
+            # global average pooling: (batch_size, channels, height, width) -> (batch_size, channels)
+            pooled_output = torch.mean(last_hidden_state, dim=[-2, -1], keepdim=False)
+        else:
+            last_hidden_state = encoder_outputs[0]
+            pooled_output = None
+
+        if not return_dict:
+            output = (last_hidden_state, pooled_output) if pooled_output is not None else (last_hidden_state,)
+            return output + encoder_outputs[1:]
+
+        return BaseModelOutputWithPoolingAndNoAttention(
+            last_hidden_state=last_hidden_state,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    MobileViTV2 model with an image classification head on top (a linear layer on top of the pooled features), e.g. for
+    ImageNet.
+    """
+)
+class MobileViTV2ForImageClassification(MobileViTV2PreTrainedModel):
+    def __init__(self, config: MobileViTV2Config) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.mobilevitv2 = MobileViTV2Model(config)
+
+        out_channels = make_divisible(512 * config.width_multiplier, divisor=8)  # layer 5 output dimension
+        # Classifier head
+        self.classifier = (
+            nn.Linear(in_features=out_channels, out_features=config.num_labels)
+            if config.num_labels > 0
+            else nn.Identity()
+        )
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, ImageClassifierOutputWithNoAttention]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the image classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss). If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.mobilevitv2(pixel_values, output_hidden_states=output_hidden_states, return_dict=return_dict)
+
+        pooled_output = outputs.pooler_output if return_dict else outputs[1]
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(labels, logits, self.config)
+
+        if not return_dict:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return ImageClassifierOutputWithNoAttention(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+        )
+
+
+# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTASPPPooling with MobileViT->MobileViTV2
+class MobileViTV2ASPPPooling(nn.Module):
+    def __init__(self, config: MobileViTV2Config, in_channels: int, out_channels: int) -> None:
+        super().__init__()
+
+        self.global_pool = nn.AdaptiveAvgPool2d(output_size=1)
+
+        self.conv_1x1 = MobileViTV2ConvLayer(
+            config,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            stride=1,
+            use_normalization=True,
+            use_activation="relu",
+        )
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        spatial_size = features.shape[-2:]
+        features = self.global_pool(features)
+        features = self.conv_1x1(features)
+        features = nn.functional.interpolate(features, size=spatial_size, mode="bilinear", align_corners=False)
+        return features
+
+
+class MobileViTV2ASPP(nn.Module):
+    """
+    ASPP module defined in DeepLab papers: https://huggingface.co/papers/1606.00915, https://huggingface.co/papers/1706.05587
+    """
+
+    def __init__(self, config: MobileViTV2Config) -> None:
+        super().__init__()
+
+        encoder_out_channels = make_divisible(512 * config.width_multiplier, divisor=8)  # layer 5 output dimension
+        in_channels = encoder_out_channels
+        out_channels = config.aspp_out_channels
+
+        if len(config.atrous_rates) != 3:
+            raise ValueError("Expected 3 values for atrous_rates")
+
+        self.convs = nn.ModuleList()
+
+        in_projection = MobileViTV2ConvLayer(
+            config,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            kernel_size=1,
+            use_activation="relu",
+        )
+        self.convs.append(in_projection)
+
+        self.convs.extend(
+            [
+                MobileViTV2ConvLayer(
+                    config,
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    kernel_size=3,
+                    dilation=rate,
+                    use_activation="relu",
+                )
+                for rate in config.atrous_rates
+            ]
+        )
+
+        pool_layer = MobileViTV2ASPPPooling(config, in_channels, out_channels)
+        self.convs.append(pool_layer)
+
+        self.project = MobileViTV2ConvLayer(
+            config, in_channels=5 * out_channels, out_channels=out_channels, kernel_size=1, use_activation="relu"
+        )
+
+        self.dropout = nn.Dropout(p=config.aspp_dropout_prob)
+
+    def forward(self, features: torch.Tensor) -> torch.Tensor:
+        pyramid = []
+        for conv in self.convs:
+            pyramid.append(conv(features))
+        pyramid = torch.cat(pyramid, dim=1)
+
+        pooled_features = self.project(pyramid)
+        pooled_features = self.dropout(pooled_features)
+        return pooled_features
+
+
+# Copied from transformers.models.mobilevit.modeling_mobilevit.MobileViTDeepLabV3 with MobileViT->MobileViTV2
+class MobileViTV2DeepLabV3(nn.Module):
+    """
+    DeepLabv3 architecture: https://huggingface.co/papers/1706.05587
+    """
+
+    def __init__(self, config: MobileViTV2Config) -> None:
+        super().__init__()
+        self.aspp = MobileViTV2ASPP(config)
+
+        self.dropout = nn.Dropout2d(config.classifier_dropout_prob)
+
+        self.classifier = MobileViTV2ConvLayer(
+            config,
+            in_channels=config.aspp_out_channels,
+            out_channels=config.num_labels,
+            kernel_size=1,
+            use_normalization=False,
+            use_activation=False,
+            bias=True,
+        )
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        features = self.aspp(hidden_states[-1])
+        features = self.dropout(features)
+        features = self.classifier(features)
+        return features
+
+
+@auto_docstring(
+    custom_intro="""
+    MobileViTV2 model with a semantic segmentation head on top, e.g. for Pascal VOC.
+    """
+)
+class MobileViTV2ForSemanticSegmentation(MobileViTV2PreTrainedModel):
+    def __init__(self, config: MobileViTV2Config) -> None:
+        super().__init__(config)
+
+        self.num_labels = config.num_labels
+        self.mobilevitv2 = MobileViTV2Model(config, expand_output=False)
+        self.segmentation_head = MobileViTV2DeepLabV3(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SemanticSegmenterOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels > 1`, a classification loss is computed (Cross-Entropy).
+
+        Examples:
+
+        ```python
+        >>> import requests
+        >>> import torch
+        >>> from PIL import Image
+        >>> from transformers import AutoImageProcessor, MobileViTV2ForSemanticSegmentation
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
+        >>> model = MobileViTV2ForSemanticSegmentation.from_pretrained("apple/mobilevitv2-1.0-imagenet1k-256")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> with torch.no_grad():
+        ...     outputs = model(**inputs)
+
+        >>> # logits are of shape (batch_size, num_labels, height, width)
+        >>> logits = outputs.logits
+        ```"""
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and self.config.num_labels == 1:
+            raise ValueError("The number of labels should be greater than one")
+
+        outputs = self.mobilevitv2(
+            pixel_values,
+            output_hidden_states=True,  # we need the intermediate hidden states
+            return_dict=return_dict,
+        )
+
+        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+        logits = self.segmentation_head(encoder_hidden_states)
+
+        loss = None
+        if labels is not None:
+            # upsample logits to the images' original size
+            upsampled_logits = nn.functional.interpolate(
+                logits, size=labels.shape[-2:], mode="bilinear", align_corners=False
+            )
+            loss_fct = CrossEntropyLoss(ignore_index=self.config.semantic_loss_ignore_index)
+            loss = loss_fct(upsampled_logits, labels)
+
+        if not return_dict:
+            if output_hidden_states:
+                output = (logits,) + outputs[1:]
+            else:
+                output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SemanticSegmenterOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=None,
+        )
+
+
+__all__ = [
+    "MobileViTV2ForImageClassification",
+    "MobileViTV2ForSemanticSegmentation",
+    "MobileViTV2Model",
+    "MobileViTV2PreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b5b33ef6ade8cf12d1af4e220d74cdb89bc10c8f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/configuration_moonshine.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/configuration_moonshine.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ee533499880562a619f1f932d7f135e69cfadce6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/configuration_moonshine.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modeling_moonshine.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modeling_moonshine.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..598950bd1ad2a480fb590033fd3dcf17bdead55d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modeling_moonshine.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modular_moonshine.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modular_moonshine.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e080555396d84a50ecbfc5514ff50137941f753b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/moonshine/__pycache__/modular_moonshine.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91ec6e7aed336ecebebc54d5974f27dea0b6ada3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/tokenization_myt5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/tokenization_myt5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..73d6032bffebb116becfff55a344a6b18b3b6f58
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/myt5/__pycache__/tokenization_myt5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc4c520fd348470ea81123168f26f94146c4942e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fe2fb222480431bea3e35b6fb610900ba5396f1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1f09a78b05c282a3dd5a39a368e9ae5c6511cc92
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/image_processing_nougat_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/processing_nougat.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/processing_nougat.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..42cdb117f51f1507a353d5726dec888b5a55799d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/processing_nougat.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/tokenization_nougat_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/tokenization_nougat_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f7a9c46742776f1377e1896457f2875a7754043c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nougat/__pycache__/tokenization_nougat_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..21a580d20c745314883a9cde18e30fe51ee0042e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/configuration_nystromformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/configuration_nystromformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e296905fc0fdbdba55e31521ea2a52db1248f4b1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/configuration_nystromformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/modeling_nystromformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/modeling_nystromformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0d0c8d32d468a2aba03d93e0f3cd16a94a08424d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/nystromformer/__pycache__/modeling_nystromformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..256074f5555278e51a83b42c41ae539821154f55
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bf511dbe981ab692d3c02b6a80294e022f6088da
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/configuration_olmo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aba60e0321e72c7ca34827d38a99d1a03bfe7203
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modeling_olmo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..680735939b19a33fac01c141c0076f2825ca42f9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo/__pycache__/modular_olmo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a823be18326aa479a98a2d8dccc5c3f0686c76e9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6a0801a84f05621f46a5b84926660547989f3e60
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/configuration_olmo3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b7a24927b04201e059ee73bffada8cbe4ee8444d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modeling_olmo3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..178a500813f8fdede8434566de5cc3b9ea6452b4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmo3/__pycache__/modular_olmo3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4ef81dc3a581ce85e3e122d9ee4a42db68010867
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/configuration_olmoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/configuration_olmoe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a48b10a8d517af501d245073a733dfe6c3760579
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/configuration_olmoe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/modeling_olmoe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/modeling_olmoe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40b0330cf6cd9bd743e70d68aae0e48f75a63f90
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/olmoe/__pycache__/modeling_olmoe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26831561e7f5379aeab71ec1b7abcaa27d3b1c65
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a66f7809fe8f00b2ac974d151f371d4e53a0ee3e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/configuration_omdet_turbo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b4eeb0ba8a08f1207b8d8f00a6ce8440481d45f6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/modeling_omdet_turbo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..df77135e36d5bca0e05f36f18486ede2ed647d6c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/omdet_turbo/__pycache__/processing_omdet_turbo.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..014feb11206605a3c02029f979c713fdbdf8b1b9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..000d9df0872e6bd674858e929283f41ecd8e68f9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/configuration_oneformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f54cf8edeaa734ca8cc9368399cfb7c11d51930e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..584e4a026d33dc59ace5a66cdaad98fc09ef1de8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/image_processing_oneformer_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3c0c37505c2745b52d741eb0d5a39e26255a036b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/oneformer/__pycache__/processing_oneformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4409a72784b7033c8989d4692dcba0b1627cab0f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/configuration_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/configuration_openai.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9a4a4c5d7e5aea7d20935e0729e36d3342f7f7cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/configuration_openai.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_openai.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fd54402e1f6a98d8840392084a817ae224763724
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_openai.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_tf_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_tf_openai.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16db04d809897b99cd9a71714533e14b6a0ddb50
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/modeling_tf_openai.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cb8728296758436a87d1dfaaaf7d1f8bb4d2bf36
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..926de034177fbf340531007dfd5b56b23ec0cd27
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/openai/__pycache__/tokenization_openai_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..280f3bb9cf58e2f2079a6fa7be987e19204736cf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a989bfb9da1344399494a26b863c370bcb37a967
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/configuration_opt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3febf9c99d494d11412ca3bb6f0d982ab5456e4a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_flax_opt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..baee2daba1f23f8440a64ff331cef13d32ec0f7f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_opt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..24c1d11a3b13899c4908d5e1be8b7e6777b066e3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/opt/__pycache__/modeling_tf_opt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..9048afe6adbdc0ad36007e02f60e899cae677c55
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__init__.py
@@ -0,0 +1,28 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_paligemma import *
+    from .modeling_paligemma import *
+    from .processing_paligemma import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..495ce5cd208119bd67e99dcc26f9fa7799d06e34
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/configuration_paligemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/configuration_paligemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b97fef8641a29ca45dd3ef7070dd93b1a40ec9f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/configuration_paligemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/modeling_paligemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/modeling_paligemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e90adc77c5a261f366e2e134c300c057fadf0461
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/modeling_paligemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/processing_paligemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/processing_paligemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..158461d8471ac34b79ca94448db7729e79af3581
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/__pycache__/processing_paligemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/configuration_paligemma.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/configuration_paligemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..e4ee4b3b45c2637d804f7eafa7f9cc7afa2638f8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/configuration_paligemma.py
@@ -0,0 +1,128 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research & University of Wisconsin-Madison and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PaliGemmamodel configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class PaliGemmaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PaliGemmaForConditionalGeneration`]. It is used to instantiate an
+    PaliGemmamodel according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the PaliGemma-2B.
+
+    e.g. [paligemma-hf/paligemma-2b](https://huggingface.co/paligemma-hf/paligemma-2b)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`PaliGemmaVisionConfig`,  *optional*):
+            Custom vision config or dict
+        text_config (`Union[AutoConfig, dict]`, *optional*):
+            The config object of the text backbone. Can be any of `LlamaConfig` or `MistralConfig`.
+        image_token_index (`int`, *optional*, defaults to 256000):
+            The image token index to encode the image prompt.
+        vocab_size (`int`, *optional*, defaults to 257152):
+            Vocabulary size of the PaliGemmamodel. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`~PaliGemmaForConditionalGeneration`]
+        projection_dim (`int`, *optional*, defaults to 2048):
+            Dimension of the multimodal projection space.
+        hidden_size (`int`, *optional*, defaults to 2048):
+            Dimension of the hidden layer of the Language model.
+
+    Example:
+
+    ```python
+    >>> from transformers import PaliGemmaForConditionalGeneration, PaliGemmaConfig, SiglipVisionConfig, GemmaConfig
+
+    >>> # Initializing a Siglip-like vision config
+    >>> vision_config = SiglipVisionConfig()
+
+    >>> # Initializing a PaliGemma config
+    >>> text_config = GemmaConfig()
+
+    >>> # Initializing a PaliGemma paligemma-3b-224 style configuration
+    >>> configuration = PaliGemmaConfig(vision_config, text_config)
+
+    >>> # Initializing a model from the paligemma-3b-224 style configuration
+    >>> model = PaliGemmaForConditionalGeneration(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "paligemma"
+    attribute_map = {
+        "image_token_id": "image_token_index",
+    }
+    sub_configs = {"text_config": AutoConfig, "vision_config": AutoConfig}
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        image_token_index=256000,
+        vocab_size=257152,
+        projection_dim=2048,
+        hidden_size=2048,
+        **kwargs,
+    ):
+        self.image_token_index = image_token_index
+        self.projection_dim = projection_dim
+        self.hidden_size = hidden_size
+        self.vision_config = vision_config
+        self.is_encoder_decoder = False
+
+        if isinstance(self.vision_config, dict):
+            vision_config["model_type"] = vision_config.get("model_type", "siglip_vision_model")
+            self.vision_config = CONFIG_MAPPING[vision_config["model_type"]](**vision_config)
+        elif vision_config is None:
+            self.vision_config = CONFIG_MAPPING["siglip_vision_model"](
+                intermediate_size=4096,
+                hidden_size=1152,
+                patch_size=14,
+                image_size=224,
+                num_hidden_layers=27,
+                num_attention_heads=16,
+                vocab_size=257152,
+                vision_use_head=False,
+            )
+
+        self.text_config = text_config
+        if isinstance(self.text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "gemma")
+            self.text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            self.text_config = CONFIG_MAPPING["gemma"](
+                hidden_size=2048,
+                num_hidden_layers=18,
+                intermediate_size=16384,
+                num_attention_heads=8,
+                num_key_value_heads=1,
+                is_encoder_decoder=False,
+                vocab_size=vocab_size,
+            )
+        self.text_config.num_image_tokens = (self.vision_config.image_size // self.vision_config.patch_size) ** 2
+        self.vision_config.projection_dim = projection_dim
+        super().__init__(**kwargs)
+
+
+__all__ = ["PaliGemmaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..abd8595e24abb7d3850a19144ebe67707b85f7ae
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/modeling_paligemma.py
@@ -0,0 +1,625 @@
+# coding=utf-8
+# Copyright 2024 the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch PaliGemmamodel."""
+
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...cache_utils import Cache, StaticCache
+from ...generation import GenerationMixin
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    ModelOutput,
+    TransformersKwargs,
+    auto_docstring,
+    can_return_tuple,
+    logging,
+)
+from ..auto import AutoModel
+from .configuration_paligemma import PaliGemmaConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for Paligemma outputs, with hidden states and attentions.
+    """
+)
+class PaligemmaModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for PaliGemma causal language model (or autoregressive) outputs.
+    """
+)
+class PaliGemmaCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.text_config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        image_hidden_states of the model produced by the vision encoder after projecting last hidden state.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+
+class PaliGemmaMultiModalProjector(nn.Module):
+    def __init__(self, config: PaliGemmaConfig):
+        super().__init__()
+        self.linear = nn.Linear(config.vision_config.hidden_size, config.vision_config.projection_dim, bias=True)
+
+    def forward(self, image_features):
+        hidden_states = self.linear(image_features)
+
+        return hidden_states
+
+
+@auto_docstring
+class PaliGemmaPreTrainedModel(PreTrainedModel):
+    config: PaliGemmaConfig
+    base_model_prefix = ""
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["PaliGemmaMultiModalProjector"]
+    _skip_keys_device_placement = "past_key_values"
+
+    _can_compile_fullgraph = False
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+    def _init_weights(self, module):
+        # important: this ported version of PaliGemmaisn't meant for training from scratch - only
+        # inference and fine-tuning
+        std = getattr(self.config, "initializer_range", self.config.get_text_config().initializer_range)
+
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+
+@auto_docstring(
+    custom_intro="""
+    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
+    """
+)
+class PaliGemmaModel(PaliGemmaPreTrainedModel):
+    _checkpoint_conversion_mapping = {"language_model.model": "language_model"}
+    # we are filtering the logits/labels so we shouldn't divide the loss based on num_items_in_batch
+    accepts_loss_kwargs = False
+
+    def __init__(self, config: PaliGemmaConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config=config.vision_config)
+        self.multi_modal_projector = PaliGemmaMultiModalProjector(config)
+        self.vocab_size = config.text_config.vocab_size
+
+        language_model = AutoModel.from_config(config=config.text_config)
+        self.language_model = language_model
+
+        self.pad_token_id = self.config.pad_token_id if self.config.pad_token_id is not None else -1
+        self.text_config_dtype = self.config.get_text_config().dtype or self.dtype
+        self.post_init()
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaModel.get_input_embeddings with Llava->PaliGemma
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    # Copied from transformers.models.llava.modeling_llava.LlavaModel.set_input_embeddings with Llava->PaliGemma
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def _update_causal_mask(
+        self,
+        attention_mask,
+        token_type_ids=None,
+        past_key_values=None,
+        cache_position=None,
+        input_tensor=None,
+        is_training: Optional[bool] = None,
+    ):
+        if self.config.text_config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+        is_training = is_training if is_training is not None else self.training
+        using_static_cache = isinstance(past_key_values, StaticCache)
+        min_dtype = torch.finfo(self.text_config_dtype).min
+        if input_tensor is None:
+            input_tensor = attention_mask
+
+        inputs_lead_dim, sequence_length = input_tensor.shape[:2]
+        if using_static_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else cache_position[0] + sequence_length + 1
+            )
+
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            return attention_mask
+
+        causal_mask = torch.full(
+            (sequence_length, target_length),
+            fill_value=min_dtype,
+            dtype=self.text_config_dtype,
+            device=cache_position.device,
+        )
+        # Causal diagonal mask only if training, otherwise attend to the whole prefix. Training-specific attn for prefix is handled below
+        if sequence_length != 1:
+            if is_training:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            else:
+                causal_mask[:, :sequence_length] = 0.0
+
+        causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(inputs_lead_dim, 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            mask_length = attention_mask.shape[-1]
+
+            # First unmask prefix tokens during training
+            if is_training:
+                if token_type_ids is None:
+                    raise ValueError("Token type ids must be provided during training")
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    token_type_ids[:, None, None, :].to(causal_mask.device) == 0, 0
+                )
+
+            # Then apply padding mask (will mask pad tokens)
+            padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(causal_mask.device)
+            padding_mask = padding_mask == 0
+            causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                padding_mask, min_dtype
+            )
+
+        return causal_mask
+
+    def get_image_features(self, pixel_values: torch.FloatTensor):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, channels, height, width)`)
+               The tensors corresponding to the input images.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_images, image_length, embed_dim)`).
+        """
+        image_outputs = self.vision_tower(pixel_values)
+        selected_image_feature = image_outputs.last_hidden_state
+        image_features = self.multi_modal_projector(selected_image_feature)
+        image_features = image_features / (self.config.text_config.hidden_size**0.5)
+        return image_features
+
+    def get_placeholder_mask(
+        self, input_ids: torch.LongTensor, inputs_embeds: torch.FloatTensor, image_features: torch.FloatTensor
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        n_image_features = image_features.shape[0] * image_features.shape[1]
+        if inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {n_image_features}"
+            )
+        return special_image_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> Union[tuple, PaligemmaModelOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+
+        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
+        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")
+
+        >>> prompt = "Where is the cat standing?"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs,)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Where is the cat standing?\nsnow"
+        ```"""
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        is_training = token_type_ids is not None and labels is not None
+
+        # Replace image id with PAD if the image token if OOV, to avoid index-errors
+        if input_ids is not None and self.config.image_token_id >= self.vocab_size:
+            special_image_mask = input_ids == self.config.image_token_id
+            llm_input_ids = input_ids.clone()
+            llm_input_ids[special_image_mask] = 0
+        else:
+            llm_input_ids = input_ids
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(llm_input_ids)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0) + 1  # Paligemma positions are 1-indexed
+
+        # Merge text and images
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values)
+            image_features = image_features.to(inputs_embeds.device, inputs_embeds.dtype)
+            special_image_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        causal_mask = self._update_causal_mask(
+            attention_mask, token_type_ids, past_key_values, cache_position, inputs_embeds, is_training
+        )
+        outputs = self.language_model(
+            attention_mask=causal_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        return PaligemmaModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The Base Paligemma model which consists of a vision backbone and a language model without language modeling head.,
+    """
+)
+class PaliGemmaForConditionalGeneration(PaliGemmaPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {
+        "^language_model.model": "model.language_model",
+        "^vision_tower": "model.vision_tower",
+        "^multi_modal_projector": "model.multi_modal_projector",
+        "^language_model.lm_head": "lm_head",
+    }
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: PaliGemmaConfig):
+        super().__init__(config)
+        self.model = PaliGemmaModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def get_image_features(self, pixel_values):
+        return self.model.get_image_features(pixel_values)
+
+    # Make modules available through conditional class for BC
+    @property
+    def language_model(self):
+        return self.model.language_model
+
+    @property
+    def vision_tower(self):
+        return self.model.vision_tower
+
+    @property
+    def multi_modal_projector(self):
+        return self.model.multi_modal_projector
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        token_type_ids: Optional[torch.LongTensor] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> Union[tuple, PaliGemmaCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.text_config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.text_config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from PIL import Image
+        >>> import requests
+        >>> from transformers import AutoProcessor, PaliGemmaForConditionalGeneration
+
+        >>> model = PaliGemmaForConditionalGeneration.from_pretrained("google/paligemma2-3b-mix-224")
+        >>> processor = AutoProcessor.from_pretrained("google/paligemma2-3b-mix-224")
+
+        >>> prompt = "Where is the cat standing?"
+        >>> url = "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/pipeline-cat-chonk.jpeg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> inputs = processor(images=image, text=prompt,  return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(**inputs,)
+        >>> processor.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Where is the cat standing?\nsnow"
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            token_type_ids=token_type_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits, labels=labels, vocab_size=self.config.text_config.vocab_size, **kwargs
+            )
+
+        return PaliGemmaCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        pixel_values=None,
+        attention_mask=None,
+        token_type_ids=None,
+        use_cache=True,
+        logits_to_keep=None,
+        labels=None,
+        **kwargs,
+    ):
+        # Overwritten -- custom `position_ids` and `pixel_values` handling
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            cache_position=cache_position,
+            use_cache=use_cache,
+            logits_to_keep=logits_to_keep,
+            token_type_ids=token_type_ids,
+            **kwargs,
+        )
+
+        # position_ids in Paligemma are 1-indexed
+        if model_inputs.get("position_ids") is not None:
+            model_inputs["position_ids"] += 1
+        # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+        # Otherwise we need pixel values to be passed to model. NOTE: use_cache=False needs pixel_values always
+        if cache_position[0] == 0:
+            model_inputs["pixel_values"] = pixel_values
+        is_training = token_type_ids is not None and labels is not None
+        is_static_hybrid_cache = isinstance(past_key_values, StaticCache) and any(past_key_values.is_sliding)
+        if cache_position[0] == 0 and is_static_hybrid_cache:
+            input_tensor = inputs_embeds if inputs_embeds is not None else input_ids
+            causal_mask = self.model._update_causal_mask(
+                attention_mask, token_type_ids, past_key_values, cache_position, input_tensor, is_training
+            )
+            model_inputs["attention_mask"] = causal_mask
+
+        return model_inputs
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+
+__all__ = ["PaliGemmaForConditionalGeneration", "PaliGemmaPreTrainedModel", "PaliGemmaModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/processing_paligemma.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/processing_paligemma.py
new file mode 100644
index 0000000000000000000000000000000000000000..242627a0eb71a98f71ca0aadf260cc51df3fb085
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/paligemma/processing_paligemma.py
@@ -0,0 +1,336 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for PaliGemma.
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, is_valid_image
+from ...processing_utils import (
+    ImagesKwargs,
+    MultiModalData,
+    ProcessingKwargs,
+    ProcessorMixin,
+    TextKwargs,
+    Unpack,
+)
+from ...tokenization_utils_base import AddedToken, PreTokenizedInput, TextInput
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+IMAGE_TOKEN = "<image>"
+EXTRA_TOKENS = [f"<loc{i:0>4}>" for i in range(1024)] + [f"<seg{i:0>3}>" for i in range(128)]
+
+
+class PaliGemmaTextKwargs(TextKwargs):
+    suffix: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]]
+
+
+class PaliGemmaImagesKwargs(ImagesKwargs):
+    do_convert_rgb: Optional[bool]
+
+
+class PaliGemmaProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: PaliGemmaTextKwargs
+    images_kwargs: PaliGemmaImagesKwargs
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": False,
+        },
+        "images_kwargs": {
+            "data_format": "channels_first",
+        },
+    }
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_url
+def is_url(val) -> bool:
+    return isinstance(val, str) and val.startswith("http")
+
+
+# Copied from transformers.models.idefics2.processing_idefics2.is_image_or_image_url
+def is_image_or_image_url(elem):
+    return is_url(elem) or is_valid_image(elem)
+
+
+def _is_str_or_image(elem):
+    return isinstance(elem, (str)) or is_image_or_image_url(elem)
+
+
+def build_string_from_input(prompt, bos_token, image_seq_len, image_token, num_images):
+    """
+    Builds a string from the input prompt and image tokens.
+    For example, for the call:
+    build_string_from_input(
+        prompt="Prefix str"
+        bos_token="<s>",
+        image_seq_len=3,
+        image_token="<im>",
+    )
+    The output will be:
+    "<im><im><im><s>Initial str"
+    Args:
+        prompt (`list[Union[str, ImageInput]]`): The input prompt.
+        bos_token (`str`): The beginning of sentence token.
+        image_seq_len (`int`): The length of the image sequence.
+        image_token (`str`): The image token.
+        num_images (`int`): Number of images in the prompt.
+    """
+    return f"{image_token * image_seq_len * num_images}{bos_token}{prompt}\n"
+
+
+class PaliGemmaProcessor(ProcessorMixin):
+    r"""
+    Constructs a PaliGemma processor which wraps a PaliGemma image processor and a PaliGemma tokenizer into a single processor.
+
+    [`PaliGemmaProcessor`] offers all the functionalities of [`SiglipImageProcessor`] and [`GemmaTokenizerFast`]. See the
+    [`~PaliGemmaProcessor.__call__`] and [`~PaliGemmaProcessor.decode`] for more information.
+
+    Args:
+        image_processor ([`SiglipImageProcessor`], *optional*):
+            The image processor is a required input.
+        tokenizer ([`GemmaTokenizerFast`], *optional*):
+            The tokenizer is a required input.
+        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
+            in a chat into a tokenizable string.
+    """
+
+    attributes = ["image_processor", "tokenizer"]
+    image_processor_class = ("SiglipImageProcessor", "SiglipImageProcessorFast")
+    tokenizer_class = ("GemmaTokenizer", "GemmaTokenizerFast")
+
+    def __init__(
+        self,
+        image_processor=None,
+        tokenizer=None,
+        chat_template=None,
+        **kwargs,
+    ):
+        if not hasattr(image_processor, "image_seq_length"):
+            raise ValueError("Image processor is missing an `image_seq_length` attribute.")
+
+        self.image_seq_length = image_processor.image_seq_length
+
+        if not hasattr(tokenizer, "image_token"):
+            image_token = AddedToken(IMAGE_TOKEN, normalized=False, special=True)
+            tokens_to_add = {"additional_special_tokens": [image_token]}
+            tokenizer.add_special_tokens(tokens_to_add)
+            self.image_token_id = tokenizer.convert_tokens_to_ids(IMAGE_TOKEN)
+            self.image_token = IMAGE_TOKEN
+        else:
+            self.image_token_id = tokenizer.image_token_id
+            self.image_token = tokenizer.image_token
+
+        tokenizer.add_tokens(EXTRA_TOKENS)
+        tokenizer.add_bos_token = False
+        tokenizer.add_eos_token = False
+
+        super().__init__(image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio=None,
+        videos=None,
+        **kwargs: Unpack[PaliGemmaProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
+        and `kwargs` arguments to GemmaTokenizerFast's [`~GemmaTokenizerFast.__call__`] if `text` is not `None` to encode
+        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
+        SiglipImageProcessor's [`~SiglipImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
+        of the above two methods for more information.
+
+        The usage for PaliGemma fine-tuning preparation is slightly different than usual. suffix passed are suffixes to
+        the prompt in `text`, and will be placed after the prompt. This is because attention is handled differently for
+        the prefix and the suffix. For instance,
+        ```python
+        image = PIL_cow_image
+        prompt = "answer en Where is the cow standing?"
+        suffix = "on the beach"
+        inputs = processor(text=prompt, images=image, suffix=suffix)
+        ```
+        Here `inputs` will contain the `input_ids` and `token_type_ids` that follow
+        ```python
+        inputs["input_ids"][:, 256:]
+        # tensor([[     2,   6006,    603,    573,  13910,   9980, 235336,    108,    477,   573,   8318]])
+        inputs["token_type_ids"][:, 256:]
+        tensor([[0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1]])
+        ```
+        Meaning the last three tokens are of "label" ("suffix") type while the other ones are of "prefix" type.
+
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `list[PIL.Image.Image]`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
+                tensor. In case of a NumPy array/PyTorch tensor, each image should be of shape (C, H, W), where C is a
+                number of channels, H and W are image height and width.
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+            suffix (`str`, `list[str]`, `list[list[str]]`):
+                The suffixes or batch of suffixes to be encoded. Only necessary for finetuning. See https://github.com/google-research/big_vision/blob/main/big_vision/configs/proj/paligemma/README.md
+                for more information. If your prompt is "<image> What is on the image", the suffix corresponds to the expected prediction "a cow sitting on a bench".
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`. If `suffix`
+              is provided, the `input_ids` will also contain the suffix input ids.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
+            - **labels** -- Labels compatible with training if `suffix` is not None
+        """
+
+        output_kwargs = self._merge_kwargs(
+            PaliGemmaProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        suffix = output_kwargs["text_kwargs"].pop("suffix", None)
+
+        return_token_type_ids = suffix is not None
+
+        if images is None:
+            raise ValueError("`images` are expected as arguments to a `PaliGemmaProcessor` instance.")
+        if text is None:
+            logger.warning_once(
+                "You are using PaliGemma without a text prefix. It will perform as a picture-captioning model."
+            )
+            text = ""
+
+        if _is_str_or_image(text):
+            text = [text]
+        elif isinstance(text, list) and _is_str_or_image(text[0]):
+            pass
+
+        if text is not None and images is not None:
+            if not any(IMAGE_TOKEN in sample for sample in text):
+                logger.warning(
+                    "You are passing both `text` and `images` to `PaliGemmaProcessor`. The processor expects special "
+                    "image tokens in the text, as many tokens as there are images per each text. It is recommended to "
+                    "add `<image>` tokens in the very beginning of your text. For this call, we will infer how many images "
+                    "each text has and add special tokens."
+                )
+
+                if isinstance(text, list) and isinstance(images, list):
+                    if len(images) != len(text):
+                        raise ValueError(
+                            f"Received {len(images)} images for {len(text)} prompts. Each prompt should be associated with an image or list of images."
+                        )
+
+                # make a nested list of lists to be able to iterate over the images and text below
+                if is_valid_image(images):
+                    images = [[images]]
+                elif isinstance(images, (list, tuple)) and is_valid_image(images[0]):
+                    images = [[image] for image in images]
+                elif not (
+                    isinstance(images, (list, tuple))
+                    and isinstance(images[0], (list, tuple))
+                    and is_valid_image(images[0][0])
+                ):
+                    raise ValueError("images must be an image, list of images or list of list of images")
+
+                input_strings = [
+                    build_string_from_input(
+                        prompt=prompt,
+                        bos_token=self.tokenizer.bos_token,
+                        image_seq_len=self.image_seq_length,
+                        image_token=IMAGE_TOKEN,
+                        num_images=len(image_list) if isinstance(image_list, list) else 1,
+                    )
+                    for prompt, image_list in zip(text, images)
+                ]
+            else:
+                expanded_samples = []
+                for sample in text:
+                    expanded_sample = sample.replace(IMAGE_TOKEN, IMAGE_TOKEN * self.image_seq_length)
+                    bos_rfind_index = expanded_sample.rfind(IMAGE_TOKEN)
+                    bos_index = bos_rfind_index + len(IMAGE_TOKEN) if bos_rfind_index != -1 else 0
+                    expanded_sample = (
+                        expanded_sample[:bos_index] + self.tokenizer.bos_token + expanded_sample[bos_index:]
+                    )
+                    expanded_samples.append(expanded_sample)
+                input_strings = [f"{sample}\n" for sample in expanded_samples]
+
+        if suffix is not None and _is_str_or_image(suffix):
+            suffix = [suffix]
+        if suffix is not None:
+            suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
+        pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", None)
+        inputs = self.tokenizer(
+            input_strings,
+            text_pair=suffix,
+            return_token_type_ids=return_token_type_ids,
+            **output_kwargs["text_kwargs"],
+        )
+        self._check_special_mm_tokens(input_strings, inputs, modalities=["image"])
+
+        return_data = {**inputs, "pixel_values": pixel_values}
+
+        if return_token_type_ids:
+            labels = np.array(inputs["input_ids"])
+            labels[np.array(inputs["token_type_ids"]) == 0] = -100
+            return_data.update({"labels": labels})
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(return_data["input_ids"])
+            mm_token_type_ids = np.zeros_like(return_data["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            return_data["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data=return_data, tensor_type=return_tensors)
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (list[list[str]], *optional*):
+                The input sizes formatted as (height, width) per each image.
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+        vision_data = {}
+        if image_sizes is not None:
+            num_image_tokens = [self.image_seq_length] * len(image_sizes)
+            num_image_patches = [1] * len(image_sizes)
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        return MultiModalData(**vision_data)
+
+
+__all__ = ["PaliGemmaProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..87f1a02b801dea628890536045de4b4ffa06c7d1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/configuration_patchtst.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/configuration_patchtst.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..aedddb44be5c37cd2701350e33a3a37f358c333e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/configuration_patchtst.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/modeling_patchtst.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/modeling_patchtst.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..60ebacf223ae75fd25974633260fdece52480b6b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/patchtst/__pycache__/modeling_patchtst.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..4903c400f9827239f7920b7e2c9bfddfda48eecd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_pegasus import *
+    from .modeling_flax_pegasus import *
+    from .modeling_pegasus import *
+    from .modeling_tf_pegasus import *
+    from .tokenization_pegasus import *
+    from .tokenization_pegasus_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__pycache__/modeling_flax_pegasus.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__pycache__/modeling_flax_pegasus.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7c6994b584a281afc745e6e47b1427ff483d892b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/__pycache__/modeling_flax_pegasus.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/configuration_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/configuration_pegasus.py
new file mode 100644
index 0000000000000000000000000000000000000000..3c27f7d44d952d769ae362b000f87b3dd6a1cfd5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/configuration_pegasus.py
@@ -0,0 +1,164 @@
+# coding=utf-8
+# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PEGASUS model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class PegasusConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PegasusModel`]. It is used to instantiate an
+    PEGASUS model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the PEGASUS
+    [google/pegasus-large](https://huggingface.co/google/pegasus-large) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 50265):
+            Vocabulary size of the PEGASUS model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`PegasusModel`] or [`TFPegasusModel`].
+        d_model (`int`, *optional*, defaults to 1024):
+            Dimensionality of the layers and the pooler layer.
+        encoder_layers (`int`, *optional*, defaults to 12):
+            Number of encoder layers.
+        decoder_layers (`int`, *optional*, defaults to 12):
+            Number of decoder layers.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 4096):
+            Dimensionality of the "intermediate" (often named feed-forward) layer in decoder.
+        activation_function (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model might ever be used with. Typically set this to something large
+            just in case (e.g., 512 or 1024 or 2048).
+        init_std (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the encoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
+            The LayerDrop probability for the decoder. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        scale_embedding (`bool`, *optional*, defaults to `False`):
+            Scale embeddings by diving by sqrt(d_model).
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models)
+        forced_eos_token_id (`int`, *optional*, defaults to 1):
+            The id of the token to force as the last generated token when `max_length` is reached. Usually set to
+            `eos_token_id`.
+
+    Example:
+
+    ```python
+    >>> from transformers import PegasusConfig, PegasusModel
+
+    >>> # Initializing a PEGASUS google/pegasus-large style configuration
+    >>> configuration = PegasusConfig()
+
+    >>> # Initializing a model (with random weights) from the google/pegasus-large style configuration
+    >>> model = PegasusModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "pegasus"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}
+
+    def __init__(
+        self,
+        vocab_size=50265,
+        max_position_embeddings=1024,
+        encoder_layers=12,
+        encoder_ffn_dim=4096,
+        encoder_attention_heads=16,
+        decoder_layers=12,
+        decoder_ffn_dim=4096,
+        decoder_attention_heads=16,
+        encoder_layerdrop=0.0,
+        decoder_layerdrop=0.0,
+        use_cache=True,
+        is_encoder_decoder=True,
+        activation_function="gelu",
+        d_model=1024,
+        dropout=0.1,
+        attention_dropout=0.0,
+        activation_dropout=0.0,
+        init_std=0.02,
+        decoder_start_token_id=0,
+        scale_embedding=False,
+        pad_token_id=0,
+        eos_token_id=1,
+        forced_eos_token_id=1,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.d_model = d_model
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_layers = encoder_layers
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.activation_function = activation_function
+        self.init_std = init_std
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.use_cache = use_cache
+        self.num_hidden_layers = encoder_layers
+        self.scale_embedding = scale_embedding  # scale factor will be sqrt(d_model) if True
+        super().__init__(
+            pad_token_id=pad_token_id,
+            eos_token_id=eos_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            decoder_start_token_id=decoder_start_token_id,
+            forced_eos_token_id=forced_eos_token_id,
+            **kwargs,
+        )
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
+
+
+__all__ = ["PegasusConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_flax_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_flax_pegasus.py
new file mode 100644
index 0000000000000000000000000000000000000000..ddf0ae492407c17bbc678b82b3d19e34da7d8a75
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_flax_pegasus.py
@@ -0,0 +1,1532 @@
+# coding=utf-8
+# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Flax PEGASUS model."""
+
+import math
+import random
+from functools import partial
+from typing import Callable, Optional
+
+import flax.linen as nn
+import jax
+import jax.numpy as jnp
+import numpy as np
+from flax.core.frozen_dict import FrozenDict, freeze, unfreeze
+from flax.linen import combine_masks, make_causal_mask
+from flax.linen.attention import dot_product_attention_weights
+from flax.traverse_util import flatten_dict, unflatten_dict
+from jax import lax
+from jax.random import PRNGKey
+
+from ...modeling_flax_outputs import (
+    FlaxBaseModelOutput,
+    FlaxBaseModelOutputWithPastAndCrossAttentions,
+    FlaxCausalLMOutputWithCrossAttentions,
+    FlaxSeq2SeqLMOutput,
+    FlaxSeq2SeqModelOutput,
+)
+from ...modeling_flax_utils import (
+    ACT2FN,
+    FlaxPreTrainedModel,
+    add_start_docstrings_to_model_forward,
+    append_call_sample_docstring,
+    append_replace_return_docstrings,
+    overwrite_call_docstring,
+)
+from ...utils import add_start_docstrings, logging, replace_return_docstrings
+from .configuration_pegasus import PegasusConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "google/pegasus-large"
+_CONFIG_FOR_DOC = "PegasusConfig"
+
+PEGASUS_START_DOCSTRING = r"""
+    This model inherits from [`FlaxPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a Flax Linen
+    [flax.nn.Module](https://flax.readthedocs.io/en/latest/_autosummary/flax.nn.module.html) subclass. Use it as a
+    regular Flax Module and refer to the Flax documentation for all matter related to general usage and behavior.
+
+    Finally, this model supports inherent JAX features such as:
+
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+
+    Parameters:
+        config ([`PegasusConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~FlaxPreTrainedModel.from_pretrained`] method to load the model weights.
+        dtype (`jax.numpy.dtype`, *optional*, defaults to `jax.numpy.float32`):
+            The data type of the computation. Can be one of `jax.numpy.float32`, `jax.numpy.float16` (on GPUs) and
+            `jax.numpy.bfloat16` (on TPUs).
+
+            This can be used to enable mixed-precision training or half-precision inference on GPUs or TPUs. If
+            specified all the computation will be performed with the given `dtype`.
+
+            **Note that this only specifies the dtype of the computation and does not influence the dtype of model
+            parameters.**
+
+            If you wish to change the dtype of the model parameters, see [`~FlaxPreTrainedModel.to_fp16`] and
+            [`~FlaxPreTrainedModel.to_bf16`].
+"""
+
+PEGASUS_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the
+            paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy.
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+PEGASUS_ENCODE_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`jnp.ndarray` of shape `(batch_size, sequence_length)`):
+            Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
+            it.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
+            config.max_position_embeddings - 1]`.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+PEGASUS_DECODE_INPUTS_DOCSTRING = r"""
+    Args:
+        decoder_input_ids (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+        encoder_outputs (`tuple(tuple(jnp.ndarray)`):
+            Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
+            `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
+            hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+        encoder_attention_mask (`jnp.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_attention_mask (`jnp.ndarray` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+
+            If you want to change padding behavior, you should modify to your needs. See diagram 1 in [the
+            paper](https://huggingface.co/papers/1910.13461) for more information on the default strategy.
+        decoder_position_ids (`numpy.ndarray` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        past_key_values (`dict[str, np.ndarray]`, *optional*, returned by `init_cache` or when passing previous `past_key_values`):
+            Dictionary of pre-computed hidden-states (key and values in the attention blocks) that can be used for fast
+            auto-regressive decoding. Pre-computed key and value hidden-states are of shape *[batch_size, max_length]*.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+"""
+
+
+# Copied from transformers.models.bart.modeling_flax_bart.shift_tokens_right
+def shift_tokens_right(input_ids: jnp.ndarray, pad_token_id: int, decoder_start_token_id: int) -> jnp.ndarray:
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = jnp.zeros_like(input_ids)
+    shifted_input_ids = shifted_input_ids.at[:, 1:].set(input_ids[:, :-1])
+    shifted_input_ids = shifted_input_ids.at[:, 0].set(decoder_start_token_id)
+
+    shifted_input_ids = jnp.where(shifted_input_ids == -100, pad_token_id, shifted_input_ids)
+    return shifted_input_ids
+
+
+# Copied from transformers.models.marian.modeling_flax_marian.create_sinusoidal_positions
+def create_sinusoidal_positions(n_pos, dim):
+    position_enc = np.array([[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)])
+    sentinel = dim // 2 + dim % 2
+    out = np.zeros_like(position_enc)
+    out[:, 0:sentinel] = np.sin(position_enc[:, 0::2])
+    out[:, sentinel:] = np.cos(position_enc[:, 1::2])
+
+    return jnp.array(out)
+
+
+# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartAttention with Bart->Pegasus
+class FlaxPegasusAttention(nn.Module):
+    config: PegasusConfig
+    embed_dim: int
+    num_heads: int
+    dropout: float = 0.0
+    causal: bool = False
+    bias: bool = True
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self) -> None:
+        self.head_dim = self.embed_dim // self.num_heads
+        if self.head_dim * self.num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {self.num_heads})."
+            )
+
+        dense = partial(
+            nn.Dense,
+            self.embed_dim,
+            use_bias=self.bias,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+
+        self.q_proj, self.k_proj, self.v_proj = dense(), dense(), dense()
+        self.out_proj = dense()
+
+        self.dropout_layer = nn.Dropout(rate=self.dropout)
+
+        if self.causal:
+            self.causal_mask = make_causal_mask(
+                jnp.ones((1, self.config.max_position_embeddings), dtype="bool"), dtype="bool"
+            )
+
+    def _split_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.num_heads, self.head_dim))
+
+    def _merge_heads(self, hidden_states):
+        return hidden_states.reshape(hidden_states.shape[:2] + (self.embed_dim,))
+
+    @nn.compact
+    def _concatenate_to_cache(self, key, value, query, attention_mask):
+        """
+        This function takes projected key, value states from a single input token and concatenates the states to cached
+        states from previous steps. This function is slightly adapted from the official Flax repository:
+        https://github.com/google/flax/blob/491ce18759622506588784b4fca0e4bf05f8c8cd/flax/linen/attention.py#L252
+        """
+        # detect if we're initializing by absence of existing cache data.
+        is_initialized = self.has_variable("cache", "cached_key")
+        cached_key = self.variable("cache", "cached_key", jnp.zeros, key.shape, key.dtype)
+        cached_value = self.variable("cache", "cached_value", jnp.zeros, value.shape, value.dtype)
+        cache_index = self.variable("cache", "cache_index", lambda: jnp.array(0, dtype=jnp.int32))
+
+        if is_initialized:
+            *batch_dims, max_length, num_heads, depth_per_head = cached_key.value.shape
+            # update key, value caches with our new 1d spatial slices
+            cur_index = cache_index.value
+            indices = (0,) * len(batch_dims) + (cur_index, 0, 0)
+            key = lax.dynamic_update_slice(cached_key.value, key, indices)
+            value = lax.dynamic_update_slice(cached_value.value, value, indices)
+            cached_key.value = key
+            cached_value.value = value
+            num_updated_cache_vectors = query.shape[1]
+            cache_index.value = cache_index.value + num_updated_cache_vectors
+            # causal mask for cached decoder self-attention: our single query position should only attend to those key positions that have already been generated and cached, not the remaining zero elements.
+            pad_mask = jnp.broadcast_to(
+                jnp.arange(max_length) < cur_index + num_updated_cache_vectors,
+                tuple(batch_dims) + (1, num_updated_cache_vectors, max_length),
+            )
+            attention_mask = combine_masks(pad_mask, attention_mask)
+        return key, value, attention_mask
+
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        key_value_states: Optional[jnp.ndarray] = None,
+        attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        deterministic: bool = True,
+    ) -> tuple[jnp.ndarray]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        batch_size = hidden_states.shape[0]
+
+        # get query proj
+        query_states = self.q_proj(hidden_states)
+        # get key, value proj
+        if is_cross_attention:
+            # cross_attentions
+            key_states = self.k_proj(key_value_states)
+            value_states = self.v_proj(key_value_states)
+        else:
+            # self_attention
+            key_states = self.k_proj(hidden_states)
+            value_states = self.v_proj(hidden_states)
+
+        query_states = self._split_heads(query_states)
+        key_states = self._split_heads(key_states)
+        value_states = self._split_heads(value_states)
+
+        # handle cache prepare causal attention mask
+        if self.causal:
+            query_length, key_length = query_states.shape[1], key_states.shape[1]
+            if self.has_variable("cache", "cached_key"):
+                mask_shift = self.variables["cache"]["cache_index"]
+                max_decoder_length = self.variables["cache"]["cached_key"].shape[1]
+                causal_mask = lax.dynamic_slice(
+                    self.causal_mask, (0, 0, mask_shift, 0), (1, 1, query_length, max_decoder_length)
+                )
+            else:
+                causal_mask = self.causal_mask[:, :, :query_length, :key_length]
+            causal_mask = jnp.broadcast_to(causal_mask, (batch_size,) + causal_mask.shape[1:])
+
+        # combine masks if needed
+        if attention_mask is not None and self.causal:
+            attention_mask = jnp.broadcast_to(jnp.expand_dims(attention_mask, axis=(-3, -2)), causal_mask.shape)
+            attention_mask = combine_masks(attention_mask, causal_mask)
+        elif self.causal:
+            attention_mask = causal_mask
+        elif attention_mask is not None:
+            attention_mask = jnp.expand_dims(attention_mask, axis=(-3, -2))
+
+        # During fast autoregressive decoding, we feed one position at a time,
+        # and cache the keys and values step by step.
+        if self.causal and (self.has_variable("cache", "cached_key") or init_cache):
+            key_states, value_states, attention_mask = self._concatenate_to_cache(
+                key_states, value_states, query_states, attention_mask
+            )
+
+        # Convert the boolean attention mask to an attention bias.
+        if attention_mask is not None:
+            # attention mask in the form of attention bias
+            attention_bias = lax.select(
+                attention_mask > 0,
+                jnp.full(attention_mask.shape, 0.0).astype(self.dtype),
+                jnp.full(attention_mask.shape, jnp.finfo(self.dtype).min).astype(self.dtype),
+            )
+        else:
+            attention_bias = None
+
+        dropout_rng = None
+        if not deterministic and self.dropout > 0.0:
+            dropout_rng = self.make_rng("dropout")
+
+        attn_weights = dot_product_attention_weights(
+            query_states,
+            key_states,
+            bias=attention_bias,
+            dropout_rng=dropout_rng,
+            dropout_rate=self.dropout,
+            broadcast_dropout=True,
+            deterministic=deterministic,
+            dtype=self.dtype,
+            precision=None,
+        )
+
+        attn_output = jnp.einsum("...hqk,...khd->...qhd", attn_weights, value_states)
+        attn_output = self._merge_heads(attn_output)
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartEncoderLayer with MBart->Pegasus
+class FlaxPegasusEncoderLayer(nn.Module):
+    config: PegasusConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        self.self_attn = FlaxPegasusAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.encoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+        self.fc1 = nn.Dense(
+            self.config.encoder_ffn_dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.fc2 = nn.Dense(
+            self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
+        )
+        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        output_attentions: bool = True,
+        deterministic: bool = True,
+    ) -> tuple[jnp.ndarray]:
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights = self.self_attn(hidden_states=hidden_states, attention_mask=attention_mask)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartEncoderLayerCollection with Bart->Pegasus
+class FlaxPegasusEncoderLayerCollection(nn.Module):
+    config: PegasusConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.layers = [
+            FlaxPegasusEncoderLayer(self.config, name=str(i), dtype=self.dtype)
+            for i in range(self.config.encoder_layers)
+        ]
+        self.layerdrop = self.config.encoder_layerdrop
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        deterministic: bool = True,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        all_attentions = () if output_attentions else None
+        all_hidden_states = () if output_hidden_states else None
+
+        for encoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if not deterministic and (dropout_probability < self.layerdrop):  # skip the layer
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions,
+                    deterministic,
+                )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        outputs = (hidden_states, all_hidden_states, all_attentions)
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_attentions
+        )
+
+
+# Copied from transformers.models.mbart.modeling_flax_mbart.FlaxMBartDecoderLayer with MBart->Pegasus
+class FlaxPegasusDecoderLayer(nn.Module):
+    config: PegasusConfig
+    dtype: jnp.dtype = jnp.float32
+
+    def setup(self) -> None:
+        self.embed_dim = self.config.d_model
+        self.self_attn = FlaxPegasusAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            causal=True,
+            dtype=self.dtype,
+        )
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+        self.activation_fn = ACT2FN[self.config.activation_function]
+        self.activation_dropout_layer = nn.Dropout(rate=self.config.activation_dropout)
+
+        self.self_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        self.encoder_attn = FlaxPegasusAttention(
+            config=self.config,
+            embed_dim=self.embed_dim,
+            num_heads=self.config.decoder_attention_heads,
+            dropout=self.config.attention_dropout,
+            dtype=self.dtype,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+        self.fc1 = nn.Dense(
+            self.config.decoder_ffn_dim,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.fc2 = nn.Dense(
+            self.embed_dim, dtype=self.dtype, kernel_init=jax.nn.initializers.normal(self.config.init_std)
+        )
+        self.final_layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+
+    def __call__(
+        self,
+        hidden_states: jnp.ndarray,
+        attention_mask: jnp.ndarray,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = True,
+        deterministic: bool = True,
+    ) -> tuple[jnp.ndarray]:
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states, attention_mask=attention_mask, init_cache=init_cache
+        )
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+            )
+            hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+            hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.activation_dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartDecoderLayerCollection with Bart->Pegasus
+class FlaxPegasusDecoderLayerCollection(nn.Module):
+    config: PegasusConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.layers = [
+            FlaxPegasusDecoderLayer(self.config, name=str(i), dtype=self.dtype)
+            for i in range(self.config.decoder_layers)
+        ]
+        self.layerdrop = self.config.decoder_layerdrop
+
+    def __call__(
+        self,
+        hidden_states,
+        attention_mask,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        deterministic: bool = True,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+    ):
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for decoder_layer in self.layers:
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+                # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if not deterministic and (dropout_probability < self.layerdrop):
+                layer_outputs = (None, None, None)
+            else:
+                layer_outputs = decoder_layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    encoder_hidden_states=encoder_hidden_states,
+                    encoder_attention_mask=encoder_attention_mask,
+                    init_cache=init_cache,
+                    output_attentions=output_attentions,
+                    deterministic=deterministic,
+                )
+
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        outputs = [hidden_states, all_hidden_states, all_self_attns, all_cross_attentions]
+
+        if not return_dict:
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+class FlaxPegasusEncoder(nn.Module):
+    config: PegasusConfig
+    embed_tokens: nn.Embed
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+
+        embed_dim = self.config.d_model
+        self.padding_idx = self.config.pad_token_id
+        self.max_source_positions = self.config.max_position_embeddings
+        self.embed_scale = math.sqrt(embed_dim) if self.config.scale_embedding else 1.0
+
+        self.embed_positions = create_sinusoidal_positions(self.config.max_position_embeddings, embed_dim)
+        self.layers = FlaxPegasusEncoderLayerCollection(self.config, self.dtype)
+        self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        input_shape = input_ids.shape
+        input_ids = input_ids.reshape(-1, input_shape[-1])
+
+        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        # embed positions
+        embed_pos = jnp.take(self.embed_positions, position_ids, axis=0)
+        # explicitly cast the positions here, since self.embed_positions are not registered as parameters
+        embed_pos = embed_pos.astype(inputs_embeds.dtype)
+
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask,
+            deterministic=deterministic,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = outputs[0]
+        last_hidden_state = self.layer_norm(last_hidden_state)
+
+        # update the last element in `hidden_states` after applying `layernorm` above
+        hidden_states = None
+        if output_hidden_states:
+            hidden_states = outputs[1]
+            hidden_states = hidden_states[:-1] + (last_hidden_state,)
+
+        if not return_dict:
+            outputs = (last_hidden_state, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutput(
+            last_hidden_state=last_hidden_state,
+            hidden_states=hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class FlaxPegasusDecoder(nn.Module):
+    config: PegasusConfig
+    embed_tokens: nn.Embed
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.dropout_layer = nn.Dropout(rate=self.config.dropout)
+
+        embed_dim = self.config.d_model
+        self.padding_idx = self.config.pad_token_id
+        self.max_target_positions = self.config.max_position_embeddings
+        self.embed_scale = math.sqrt(self.config.d_model) if self.config.scale_embedding else 1.0
+
+        self.embed_positions = create_sinusoidal_positions(self.config.max_position_embeddings, embed_dim)
+
+        self.layers = FlaxPegasusDecoderLayerCollection(self.config, self.dtype)
+        self.layer_norm = nn.LayerNorm(dtype=self.dtype, epsilon=1e-05)
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        position_ids,
+        encoder_hidden_states: Optional[jnp.ndarray] = None,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        init_cache: bool = False,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        input_shape = input_ids.shape
+        input_ids = input_ids.reshape(-1, input_shape[-1])
+
+        inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        # embed positions
+        positions = jnp.take(self.embed_positions, position_ids, axis=0)
+        # explicitly cast the positions here, since self.embed_positions are not registered as parameters
+        positions = positions.astype(inputs_embeds.dtype)
+
+        hidden_states = inputs_embeds + positions
+        hidden_states = self.dropout_layer(hidden_states, deterministic=deterministic)
+        outputs = self.layers(
+            hidden_states,
+            attention_mask,
+            encoder_hidden_states,
+            encoder_attention_mask,
+            deterministic=deterministic,
+            init_cache=init_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        last_hidden_state = outputs[0]
+        last_hidden_state = self.layer_norm(last_hidden_state)
+
+        # update the last element in `hidden_states` after applying `layernorm` above
+        hidden_states = None
+        if output_hidden_states:
+            hidden_states = outputs[1]
+            hidden_states = hidden_states[:-1] + (last_hidden_state,)
+
+        if not return_dict:
+            outputs = (last_hidden_state, hidden_states) + (outputs[2:] if output_hidden_states else outputs[1:])
+            return tuple(v for v in outputs if v is not None)
+
+        return FlaxBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=last_hidden_state,
+            hidden_states=hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartModule with Bart->Pegasus
+class FlaxPegasusModule(nn.Module):
+    config: PegasusConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+
+    def setup(self):
+        self.shared = nn.Embed(
+            self.config.vocab_size,
+            self.config.d_model,
+            embedding_init=jax.nn.initializers.normal(self.config.init_std),
+            dtype=self.dtype,
+        )
+
+        self.encoder = FlaxPegasusEncoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
+        self.decoder = FlaxPegasusDecoder(self.config, dtype=self.dtype, embed_tokens=self.shared)
+
+    def _get_encoder_module(self):
+        return self.encoder
+
+    def _get_decoder_module(self):
+        return self.decoder
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        decoder_input_ids,
+        decoder_attention_mask,
+        position_ids,
+        decoder_position_ids,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        encoder_outputs = self.encoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return FlaxSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+class FlaxPegasusPreTrainedModel(FlaxPreTrainedModel):
+    config_class = PegasusConfig
+    base_model_prefix: str = "model"
+    module_class: nn.Module = None
+
+    def __init__(
+        self,
+        config: PegasusConfig,
+        input_shape: tuple[int] = (1, 1),
+        seed: int = 0,
+        dtype: jnp.dtype = jnp.float32,
+        _do_init: bool = True,
+        **kwargs,
+    ):
+        module = self.module_class(config=config, dtype=dtype, **kwargs)
+        super().__init__(config, module, input_shape=input_shape, seed=seed, dtype=dtype, _do_init=_do_init)
+
+    def init_weights(self, rng: jax.random.PRNGKey, input_shape: tuple, params: FrozenDict = None) -> FrozenDict:
+        # init input tensors
+        input_ids = jnp.zeros(input_shape, dtype="i4")
+        attention_mask = jnp.ones_like(input_ids)
+        decoder_input_ids = input_ids
+        decoder_attention_mask = jnp.ones_like(input_ids)
+
+        batch_size, sequence_length = input_ids.shape
+        position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+        decoder_position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+
+        random_params = self.module.init(
+            rngs,
+            input_ids,
+            attention_mask,
+            decoder_input_ids,
+            decoder_attention_mask,
+            position_ids,
+            decoder_position_ids,
+        )["params"]
+
+        if params is not None:
+            random_params = flatten_dict(unfreeze(random_params))
+            params = flatten_dict(unfreeze(params))
+            for missing_key in self._missing_keys:
+                params[missing_key] = random_params[missing_key]
+            self._missing_keys = set()
+            return freeze(unflatten_dict(params))
+        else:
+            return random_params
+
+    def init_cache(self, batch_size, max_length, encoder_outputs):
+        r"""
+        Args:
+            batch_size (`int`):
+                batch_size used for fast auto-regressive decoding. Defines the batch size of the initialized cache.
+            max_length (`int`):
+                maximum possible length for auto-regressive decoding. Defines the sequence length of the initialized
+                cache.
+            encoder_outputs (`Union[FlaxBaseModelOutput, tuple(tuple(jnp.ndarray)]`):
+                `encoder_outputs` consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*:
+                `attentions`). `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*)
+                is a sequence of hidden-states at the output of the last layer of the encoder. Used in the
+                cross-attention of the decoder.
+        """
+        # init input variables to retrieve cache
+        decoder_input_ids = jnp.ones((batch_size, max_length), dtype="i4")
+        decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        decoder_position_ids = jnp.broadcast_to(
+            jnp.arange(jnp.atleast_2d(decoder_input_ids).shape[-1]), decoder_input_ids.shape
+        )
+
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                **kwargs,
+            )
+
+        init_variables = self.module.init(
+            jax.random.PRNGKey(0),
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],
+            init_cache=True,
+            method=_decoder_forward,  # we only need to call the decoder to init the cache
+        )
+        return unfreeze(init_variables["cache"])
+
+    @add_start_docstrings(PEGASUS_ENCODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutput, config_class=PegasusConfig)
+    def encode(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
+
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
+
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
+        >>> encoder_outputs = model.encode(**inputs)
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if position_ids is None:
+            batch_size, sequence_length = input_ids.shape
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        def _encoder_forward(module, input_ids, attention_mask, position_ids, **kwargs):
+            encode_module = module._get_encoder_module()
+            return encode_module(input_ids, attention_mask, position_ids, **kwargs)
+
+        return self.module.apply(
+            {"params": params or self.params},
+            input_ids=jnp.array(input_ids, dtype="i4"),
+            attention_mask=jnp.array(attention_mask, dtype="i4"),
+            position_ids=jnp.array(position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            method=_encoder_forward,
+        )
+
+    @add_start_docstrings(PEGASUS_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxBaseModelOutputWithPastAndCrossAttentions, config_class=PegasusConfig)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: Optional[dict] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import jax.numpy as jnp
+        >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
+
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
+
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
+        >>> encoder_outputs = model.encode(**inputs)
+
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> last_decoder_hidden_states = outputs.last_hidden_state
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        encoder_hidden_states = encoder_outputs[0]
+        if encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxPegasusAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            return decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                **kwargs,
+            )
+
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs, past = outputs
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs, past = outputs
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+
+        return outputs
+
+    @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING)
+    def __call__(
+        self,
+        input_ids: jnp.ndarray,
+        attention_mask: Optional[jnp.ndarray] = None,
+        decoder_input_ids: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        position_ids: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        train: bool = False,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        # prepare encoder inputs
+        if attention_mask is None:
+            attention_mask = jnp.ones_like(input_ids)
+        if position_ids is None:
+            batch_size, sequence_length = input_ids.shape
+            position_ids = jnp.broadcast_to(jnp.arange(sequence_length)[None, :], (batch_size, sequence_length))
+
+        # prepare decoder inputs
+        if decoder_input_ids is None:
+            decoder_input_ids = shift_tokens_right(
+                input_ids, self.config.pad_token_id, decoder_start_token_id=self.config.decoder_start_token_id
+            )
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones_like(decoder_input_ids)
+        if decoder_position_ids is None:
+            batch_size, sequence_length = decoder_input_ids.shape
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+
+        # Handle any PRNG if needed
+        rngs = {"dropout": dropout_rng} if dropout_rng is not None else {}
+
+        return self.module.apply(
+            {"params": params or self.params},
+            input_ids=jnp.array(input_ids, dtype="i4"),
+            attention_mask=jnp.array(attention_mask, dtype="i4"),
+            position_ids=jnp.array(position_ids, dtype="i4"),
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=not train,
+            rngs=rngs,
+        )
+
+
+@add_start_docstrings(
+    "The bare Pegasus Model transformer outputting raw hidden-states without any specific head on top.",
+    PEGASUS_START_DOCSTRING,
+)
+class FlaxPegasusModel(FlaxPegasusPreTrainedModel):
+    config: PegasusConfig
+    dtype: jnp.dtype = jnp.float32  # the dtype of the computation
+    module_class = FlaxPegasusModule
+
+
+append_call_sample_docstring(FlaxPegasusModel, _CHECKPOINT_FOR_DOC, FlaxSeq2SeqModelOutput, _CONFIG_FOR_DOC)
+
+
+# Copied from transformers.models.bart.modeling_flax_bart.FlaxBartForConditionalGenerationModule with Bart->Pegasus
+class FlaxPegasusForConditionalGenerationModule(nn.Module):
+    config: PegasusConfig
+    dtype: jnp.dtype = jnp.float32
+    bias_init: Callable[..., jnp.ndarray] = jax.nn.initializers.zeros
+
+    def setup(self):
+        self.model = FlaxPegasusModule(config=self.config, dtype=self.dtype)
+        self.lm_head = nn.Dense(
+            self.model.shared.num_embeddings,
+            use_bias=False,
+            dtype=self.dtype,
+            kernel_init=jax.nn.initializers.normal(self.config.init_std),
+        )
+        self.final_logits_bias = self.param("final_logits_bias", self.bias_init, (1, self.model.shared.num_embeddings))
+
+    def _get_encoder_module(self):
+        return self.model.encoder
+
+    def _get_decoder_module(self):
+        return self.model.decoder
+
+    def __call__(
+        self,
+        input_ids,
+        attention_mask,
+        decoder_input_ids,
+        decoder_attention_mask,
+        position_ids,
+        decoder_position_ids,
+        output_attentions: bool = False,
+        output_hidden_states: bool = False,
+        return_dict: bool = True,
+        deterministic: bool = True,
+    ):
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            position_ids=position_ids,
+            decoder_position_ids=decoder_position_ids,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+        )
+
+        hidden_states = outputs[0]
+
+        if self.config.tie_word_embeddings:
+            shared_embedding = self.model.variables["params"]["shared"]["embedding"]
+            lm_logits = self.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+        else:
+            lm_logits = self.lm_head(hidden_states)
+
+        lm_logits += jax.lax.stop_gradient(self.final_logits_bias.astype(self.dtype))
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return output
+
+        return FlaxSeq2SeqLMOutput(
+            logits=lm_logits,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+
+@add_start_docstrings(
+    "The PEGASUS Model with a language modeling head. Can be used for summarization.", PEGASUS_START_DOCSTRING
+)
+class FlaxPegasusForConditionalGeneration(FlaxPegasusPreTrainedModel):
+    module_class = FlaxPegasusForConditionalGenerationModule
+    dtype: jnp.dtype = jnp.float32
+
+    @add_start_docstrings(PEGASUS_DECODE_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=FlaxCausalLMOutputWithCrossAttentions, config_class=PegasusConfig)
+    def decode(
+        self,
+        decoder_input_ids,
+        encoder_outputs,
+        encoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_attention_mask: Optional[jnp.ndarray] = None,
+        decoder_position_ids: Optional[jnp.ndarray] = None,
+        past_key_values: Optional[dict] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        deterministic: bool = True,
+        params: Optional[dict] = None,
+        dropout_rng: PRNGKey = None,
+    ):
+        r"""
+        Returns:
+
+        Example:
+
+        ```python
+        >>> import jax.numpy as jnp
+        >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
+
+        >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
+
+        >>> text = "My friends are cool but they eat too many carbs."
+        >>> inputs = tokenizer(text, max_length=1024, return_tensors="np")
+        >>> encoder_outputs = model.encode(**inputs)
+
+        >>> decoder_start_token_id = model.config.decoder_start_token_id
+        >>> decoder_input_ids = jnp.ones((inputs.input_ids.shape[0], 1), dtype="i4") * decoder_start_token_id
+
+        >>> outputs = model.decode(decoder_input_ids, encoder_outputs)
+        >>> logits = outputs.logits
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.return_dict
+
+        encoder_hidden_states = encoder_outputs[0]
+        if encoder_attention_mask is None:
+            batch_size, sequence_length = encoder_hidden_states.shape[:2]
+            encoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        batch_size, sequence_length = decoder_input_ids.shape
+        if decoder_attention_mask is None:
+            decoder_attention_mask = jnp.ones((batch_size, sequence_length))
+
+        if decoder_position_ids is None:
+            if past_key_values is not None:
+                raise ValueError("Make sure to provide `decoder_position_ids` when passing `past_key_values`.")
+
+            decoder_position_ids = jnp.broadcast_to(
+                jnp.arange(sequence_length)[None, :], (batch_size, sequence_length)
+            )
+
+        # Handle any PRNG if needed
+        rngs = {}
+        if dropout_rng is not None:
+            rngs["dropout"] = dropout_rng
+
+        inputs = {"params": params or self.params}
+
+        # if past_key_values are passed then cache is already initialized a private flag init_cache has to be
+        # passed down to ensure cache is used. It has to be made sure that cache is marked as mutable so that
+        # it can be changed by FlaxPegasusAttention module
+        if past_key_values:
+            inputs["cache"] = past_key_values
+            mutable = ["cache"]
+        else:
+            mutable = False
+
+        def _decoder_forward(module, decoder_input_ids, decoder_attention_mask, decoder_position_ids, **kwargs):
+            decoder_module = module._get_decoder_module()
+            outputs = decoder_module(
+                decoder_input_ids,
+                decoder_attention_mask,
+                decoder_position_ids,
+                **kwargs,
+            )
+            hidden_states = outputs[0]
+
+            if self.config.tie_word_embeddings:
+                shared_embedding = module.model.variables["params"]["shared"]["embedding"]
+                lm_logits = module.lm_head.apply({"params": {"kernel": shared_embedding.T}}, hidden_states)
+            else:
+                lm_logits = module.lm_head(hidden_states)
+
+            lm_logits += module.final_logits_bias.astype(self.dtype)
+            return lm_logits, outputs
+
+        outputs = self.module.apply(
+            inputs,
+            decoder_input_ids=jnp.array(decoder_input_ids, dtype="i4"),
+            decoder_attention_mask=jnp.array(decoder_attention_mask, dtype="i4"),
+            decoder_position_ids=jnp.array(decoder_position_ids, dtype="i4"),
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=jnp.array(encoder_attention_mask, dtype="i4"),
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            deterministic=deterministic,
+            rngs=rngs,
+            mutable=mutable,
+            method=_decoder_forward,
+        )
+
+        if past_key_values is None:
+            lm_logits, decoder_outputs = outputs
+        else:
+            (lm_logits, decoder_outputs), past = outputs
+
+        if return_dict:
+            outputs = FlaxCausalLMOutputWithCrossAttentions(
+                logits=lm_logits,
+                hidden_states=decoder_outputs.hidden_states,
+                attentions=decoder_outputs.attentions,
+                cross_attentions=decoder_outputs.cross_attentions,
+            )
+        else:
+            outputs = (lm_logits,) + decoder_outputs[1:]
+
+        # add updated cache to model output
+        if past_key_values is not None and return_dict:
+            outputs["past_key_values"] = unfreeze(past["cache"])
+            return outputs
+        elif past_key_values is not None and not return_dict:
+            outputs = outputs[:1] + (unfreeze(past["cache"]),) + outputs[1:]
+
+        return outputs
+
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        max_length,
+        attention_mask: Optional[jax.Array] = None,
+        decoder_attention_mask: Optional[jax.Array] = None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # initializing the cache
+        batch_size, seq_length = decoder_input_ids.shape
+
+        past_key_values = self.init_cache(batch_size, max_length, encoder_outputs)
+        # Note that usually one would have to put 0's in the attention_mask for x > input_ids.shape[-1] and x < cache_length.
+        # But since the decoder uses a causal mask, those positions are masked anyways.
+        # Thus we can create a single static attention_mask here, which is more efficient for compilation
+        extended_attention_mask = jnp.ones((batch_size, max_length), dtype="i4")
+        if decoder_attention_mask is not None:
+            position_ids = decoder_attention_mask.cumsum(axis=-1) - 1
+            extended_attention_mask = lax.dynamic_update_slice(extended_attention_mask, decoder_attention_mask, (0, 0))
+        else:
+            position_ids = jnp.broadcast_to(jnp.arange(seq_length, dtype="i4")[None, :], (batch_size, seq_length))
+
+        return {
+            "past_key_values": past_key_values,
+            "encoder_outputs": encoder_outputs,
+            "encoder_attention_mask": attention_mask,
+            "decoder_attention_mask": extended_attention_mask,
+            "decoder_position_ids": position_ids,
+        }
+
+    def update_inputs_for_generation(self, model_outputs, model_kwargs):
+        model_kwargs["past_key_values"] = model_outputs.past_key_values
+        model_kwargs["decoder_position_ids"] = model_kwargs["decoder_position_ids"][:, -1:] + 1
+        return model_kwargs
+
+
+FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING = """
+    Returns:
+
+    Summarization example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
+
+    >>> model = FlaxPegasusForConditionalGeneration.from_pretrained('google/pegasus-large')
+    >>> tokenizer = AutoTokenizer.from_pretrained('google/pegasus-large')
+
+    >>> ARTICLE_TO_SUMMARIZE = "My friends are cool but they eat too many carbs."
+    >>> inputs = tokenizer([ARTICLE_TO_SUMMARIZE], max_length=1024, return_tensors='np')
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(inputs['input_ids']).sequences
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+
+    Mask filling example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, FlaxPegasusForConditionalGeneration
+
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
+    >>> TXT = "My friends are <mask> but they eat too many carbs."
+
+    >>> model = FlaxPegasusForConditionalGeneration.from_pretrained("google/pegasus-large")
+    >>> input_ids = tokenizer([TXT], return_tensors="np")["input_ids"]
+    >>> logits = model(input_ids).logits
+
+    >>> masked_index = (input_ids[0] == tokenizer.mask_token_id).nonzero().item()
+    >>> probs = jax.nn.softmax(logits[0, masked_index], axis=0)
+    >>> values, predictions = jax.lax.top_k(probs)
+
+    >>> tokenizer.decode(predictions).split()
+    ```
+"""
+
+overwrite_call_docstring(
+    FlaxPegasusForConditionalGeneration, PEGASUS_INPUTS_DOCSTRING + FLAX_PEGASUS_CONDITIONAL_GENERATION_DOCSTRING
+)
+append_replace_return_docstrings(
+    FlaxPegasusForConditionalGeneration, output_type=FlaxSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC
+)
+
+
+__all__ = ["FlaxPegasusForConditionalGeneration", "FlaxPegasusModel", "FlaxPegasusPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_pegasus.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc3a8005acac814393789cfa6146f413af2863bd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_pegasus.py
@@ -0,0 +1,1671 @@
+# coding=utf-8
+# Copyright 2021, Google and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch PEGASUS model."""
+
+import copy
+import math
+from typing import Callable, Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import (
+    AttentionMaskConverter,
+    _prepare_4d_attention_mask,
+    _prepare_4d_attention_mask_for_sdpa,
+)
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    CausalLMOutputWithCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+)
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import (
+    auto_docstring,
+    is_torch_flex_attn_available,
+    is_torchdynamo_compiling,
+    logging,
+)
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_pegasus import PegasusConfig
+
+
+if is_torch_flex_attn_available():
+    from ...integrations.flex_attention import BlockMask, make_flex_block_causal_mask
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+# Copied from transformers.models.marian.modeling_marian.MarianSinusoidalPositionalEmbedding with Marian->Pegasus
+class PegasusSinusoidalPositionalEmbedding(nn.Embedding):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None) -> None:
+        super().__init__(num_positions, embedding_dim)
+
+    def _init_weight(self):
+        """
+        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
+        the 2nd half of the vector. [dim // 2:]
+        """
+        n_pos, dim = self.weight.shape
+        position_enc = np.array(
+            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
+        )
+        out = torch.empty(n_pos, dim, dtype=self.weight.dtype, requires_grad=False)
+        sentinel = dim // 2 if dim % 2 == 0 else (dim // 2) + 1
+        out[:, 0:sentinel] = torch.FloatTensor(np.sin(position_enc[:, 0::2]))
+        out[:, sentinel:] = torch.FloatTensor(np.cos(position_enc[:, 1::2]))
+        self.weight = nn.Parameter(out, requires_grad=False)
+
+    @torch.no_grad()
+    def forward(
+        self, input_ids_shape: torch.Size, past_key_values_length: int = 0, position_ids: Optional[torch.Tensor] = None
+    ) -> torch.Tensor:
+        """`input_ids_shape` is expected to be [bsz x seqlen]."""
+        if position_ids is None:
+            bsz, seq_len = input_ids_shape[:2]
+            position_ids = torch.arange(
+                past_key_values_length, past_key_values_length + seq_len, dtype=torch.long, device=self.weight.device
+            )
+        return super().forward(position_ids)
+
+
+# Copied from transformers.models.bart.modeling_bart.eager_attention_forward
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: Optional[float] = None,
+    dropout: float = 0.0,
+    head_mask: Optional[torch.Tensor] = None,
+    **kwargs,
+):
+    if scaling is None:
+        scaling = query.size(-1) ** -0.5
+
+    attn_weights = torch.matmul(query, key.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        attn_weights = attn_weights + attention_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+    if head_mask is not None:
+        attn_weights = attn_weights * head_mask.view(1, -1, 1, 1)
+
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+# Copied from transformers.models.bart.modeling_bart.BartAttention with Bart->Pegasus
+class PegasusAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[PegasusConfig] = None,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+        self.layer_idx = layer_idx
+        if layer_idx is None and self.is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        key_value_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+        # TODO: we need a refactor so that the different attention modules can get their specific kwargs
+        # ATM, we have mixed things encoder, decoder, and encoder-decoder attn
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+
+        # determine input shapes
+        bsz, tgt_len = hidden_states.shape[:-1]
+        src_len = key_value_states.shape[1] if is_cross_attention else tgt_len
+
+        q_input_shape = (bsz, tgt_len, -1, self.head_dim)
+        kv_input_shape = (bsz, src_len, -1, self.head_dim)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states).view(*q_input_shape).transpose(1, 2)
+
+        is_updated = False
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = key_value_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(*kv_input_shape).transpose(1, 2)
+            value_states = value_states.view(*kv_input_shape).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.dropout,
+            scaling=self.scaling,
+            output_attentions=output_attentions,
+            head_mask=layer_head_mask,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(bsz, tgt_len, -1).contiguous()
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.mbart.modeling_mbart.MBartEncoderLayer with MBart->Pegasus, MBART->PEGASUS
+class PegasusEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: PegasusConfig):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = PegasusAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.encoder_attention_heads,
+            dropout=config.attention_dropout,
+            config=config,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(self.embed_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        layer_head_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        if hidden_states.dtype == torch.float16:
+            clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+            hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        return hidden_states, attn_weights
+
+
+# Copied from transformers.models.mbart.modeling_mbart.MBartDecoderLayer with MBart->Pegasus, MBART->PEGASUS
+class PegasusDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: PegasusConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.embed_dim = config.d_model
+
+        self.self_attn = PegasusAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            is_causal=True,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.encoder_attn = PegasusAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            config=config,
+            layer_idx=layer_idx,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.fc1 = nn.Linear(self.embed_dim, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, self.embed_dim)
+        self.final_layer_norm = nn.LayerNorm(self.embed_dim)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        layer_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_layer_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very large negative values.
+            layer_head_mask (`torch.FloatTensor`): mask for attention heads in a given layer of size
+                `(encoder_attention_heads,)`.
+            cross_attn_layer_head_mask (`torch.FloatTensor`): mask for cross-attention heads in a given layer of
+                size `(decoder_attention_heads,)`.
+            past_key_values (`Cache`): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
+                cache in the correct position and to infer the complete sequence length.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            hidden_states, cross_attn_weights = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+            )
+            hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+            hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+@auto_docstring
+class PegasusPreTrainedModel(PreTrainedModel):
+    config: PegasusConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+
+    def _init_weights(self, module):
+        std = self.config.init_std
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, PegasusSinusoidalPositionalEmbedding):
+            module._init_weight()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_full_mask
+    def _update_full_mask(
+        self,
+        attention_mask: Union[torch.Tensor, None],
+        inputs_embeds: torch.Tensor,
+    ):
+        if attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                attention_mask = attention_mask if 0 in attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & head_mask can not be supported when using SDPA, fall back to
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask_for_sdpa(attention_mask, inputs_embeds.dtype)
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(attention_mask, torch.Tensor):
+                    attention_mask = make_flex_block_causal_mask(attention_mask, is_causal=False)
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        return attention_mask
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_causal_mask
+    def _update_causal_mask(
+        self,
+        attention_mask: Optional[Union[torch.Tensor, "BlockMask"]],
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values: Cache,
+    ):
+        if self.config._attn_implementation == "flex_attention":
+            if isinstance(attention_mask, torch.Tensor):
+                attention_mask = make_flex_block_causal_mask(attention_mask)
+            # Other attention flavors support in-built causal (when `mask is None`)
+            # while we need to create our specific block mask regardless
+            elif attention_mask is None:
+                attention_mask = make_flex_block_causal_mask(
+                    torch.ones(
+                        size=(input_tensor.shape[0], input_tensor.shape[1]),
+                        device=attention_mask.device,
+                    )
+                )
+            return attention_mask
+
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and (attention_mask == 0.0).any():
+                return attention_mask
+            return None
+
+        # For SDPA, when possible, we will rely on its `is_causal` argument instead of its `attn_mask` argument, in
+        # order to dispatch on Flash Attention 2. This feature is not compatible with static cache, as SDPA will fail
+        # to infer the attention mask.
+        past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+        using_compilable_cache = past_key_values.is_compileable if past_key_values is not None else False
+
+        # When output attentions is True, sdpa implementation's forward method calls the eager implementation's forward
+        if self.config._attn_implementation == "sdpa" and not using_compilable_cache:
+            if AttentionMaskConverter._ignore_causal_mask_sdpa(
+                attention_mask,
+                inputs_embeds=input_tensor,
+                past_key_values_length=past_seen_tokens,
+                is_training=self.training,
+            ):
+                return None
+
+        dtype = input_tensor.dtype
+        sequence_length = input_tensor.shape[1]
+        if using_compilable_cache:
+            target_length = past_key_values.get_max_cache_shape()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
+
+        # In case the provided `attention` mask is 2D, we generate a causal mask here (4D).
+        causal_mask = self._prepare_4d_causal_attention_mask_with_cache_position(
+            attention_mask,
+            sequence_length=sequence_length,
+            target_length=target_length,
+            dtype=dtype,
+            cache_position=cache_position,
+            batch_size=input_tensor.shape[0],
+        )
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+    @staticmethod
+    # Copied from transformers.models.gptj.modeling_gptj.GPTJModel._prepare_4d_causal_attention_mask_with_cache_position
+    def _prepare_4d_causal_attention_mask_with_cache_position(
+        attention_mask: torch.Tensor,
+        sequence_length: int,
+        target_length: int,
+        dtype: torch.dtype,
+        cache_position: torch.Tensor,
+        batch_size: int,
+        **kwargs,
+    ):
+        """
+        Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
+        `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
+
+        Args:
+            attention_mask (`torch.Tensor`):
+                A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape
+                `(batch_size, 1, query_length, key_value_length)`.
+            sequence_length (`int`):
+                The sequence length being processed.
+            target_length (`int`):
+                The target length: when generating with static cache, the mask should be as long as the static cache,
+                to account for the 0 padding, the part of the cache that is not filled yet.
+            dtype (`torch.dtype`):
+                The dtype to use for the 4D attention mask.
+            cache_position (`torch.Tensor`):
+                Indices depicting the position of the input sequence tokens in the sequence.
+            batch_size (`torch.Tensor`):
+                Batch size.
+        """
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # In this case we assume that the mask comes already in inverted form and requires no inversion or slicing.
+            causal_mask = attention_mask
+        else:
+            min_dtype = torch.finfo(dtype).min
+            causal_mask = torch.full(
+                (sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=cache_position.device
+            )
+            if sequence_length != 1:
+                causal_mask = torch.triu(causal_mask, diagonal=1)
+            causal_mask *= torch.arange(target_length, device=cache_position.device) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(batch_size, 1, -1, -1)
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[:, :, :, :mask_length] + attention_mask[:, None, None, :].to(
+                    causal_mask.device
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[:, :, :, :mask_length].masked_fill(
+                    padding_mask, min_dtype
+                )
+
+        return causal_mask
+
+    # Copied from transformers.models.bart.modeling_bart.BartPreTrainedModel._update_cross_attn_mask
+    def _update_cross_attn_mask(
+        self,
+        encoder_hidden_states: Union[torch.Tensor, None],
+        encoder_attention_mask: Union[torch.Tensor, None],
+        input_shape: torch.Size,
+        inputs_embeds: torch.Tensor,
+    ):
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            if self.config._attn_implementation == "flash_attention_2":
+                encoder_attention_mask = encoder_attention_mask if 0 in encoder_attention_mask else None
+            elif self.config._attn_implementation == "sdpa":
+                # output_attentions=True & cross_attn_head_mask can not be supported when using SDPA, and we fall back on
+                # the manual implementation that requires a 4D causal mask in all cases.
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask_for_sdpa(
+                    encoder_attention_mask,
+                    inputs_embeds.dtype,
+                    tgt_len=input_shape[-1],
+                )
+            elif self.config._attn_implementation == "flex_attention":
+                if isinstance(encoder_attention_mask, torch.Tensor):
+                    encoder_attention_mask = make_flex_block_causal_mask(
+                        encoder_attention_mask,
+                        query_length=input_shape[-1],
+                        is_causal=False,
+                    )
+            else:
+                # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+                encoder_attention_mask = _prepare_4d_attention_mask(
+                    encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+                )
+
+        return encoder_attention_mask
+
+
+class PegasusEncoder(PegasusPreTrainedModel):
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`PegasusEncoderLayer`].
+
+    Args:
+        config: PegasusConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+
+        embed_dim = config.d_model
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            self.embed_tokens = embed_tokens
+        else:
+            self.embed_tokens = nn.Embedding(config.vocab_size, embed_dim, self.padding_idx)
+
+        self.embed_positions = PegasusSinusoidalPositionalEmbedding(
+            config.max_position_embeddings,
+            embed_dim,
+            self.padding_idx,
+        )
+        self.layers = nn.ModuleList([PegasusEncoderLayer(config) for _ in range(config.encoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings matrix of the model if `new_num_position_embeddings !=
+        config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The number of new position embeddings. If position embeddings are learned, increasing the size will add
+                newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
+                position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
+                add correct vectors at the end following the position encoding algorithm, whereas reducing the size
+                will remove vectors from the end.
+        """
+        logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...")
+        self.config.max_position_embeddings = new_num_position_embeddings
+
+        self.embed_positions = PegasusSinusoidalPositionalEmbedding(
+            self.config.max_position_embeddings,
+            self.config.d_model,
+            self.padding_idx,
+        )
+        self.embed_positions._init_weight()
+        self.embed_positions.to(self.device)
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings matrix
+        """
+        return self.embed_positions
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        head_mask=None,
+        inputs_embeds=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            self.warn_if_padding_and_no_attention_mask(input_ids, attention_mask)
+            input_shape = input_ids.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        embed_pos = self.embed_positions(input_shape)
+
+        hidden_states = inputs_embeds + embed_pos
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        attention_mask = self._update_full_mask(
+            attention_mask,
+            inputs_embeds,
+        )
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            if head_mask.size()[0] != len(self.layers):
+                raise ValueError(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {head_mask.size()[0]}."
+                )
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class PegasusDecoder(PegasusPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`PegasusDecoderLayer`]
+
+    Args:
+        config: PegasusConfig
+        embed_tokens (nn.Embedding): output embedding
+    """
+
+    def __init__(self, config: PegasusConfig, embed_tokens: Optional[nn.Embedding] = None):
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_target_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(config.d_model) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            self.embed_tokens = embed_tokens
+        else:
+            self.embed_tokens = nn.Embedding(config.vocab_size, config.d_model, self.padding_idx)
+
+        self.embed_positions = PegasusSinusoidalPositionalEmbedding(
+            config.max_position_embeddings,
+            config.d_model,
+            self.padding_idx,
+        )
+        self.layers = nn.ModuleList([PegasusDecoderLayer(config, layer_idx=i) for i in range(config.decoder_layers)])
+        self.layer_norm = nn.LayerNorm(config.d_model)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings matrix of the model if `new_num_position_embeddings !=
+        config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The number of new position embeddings. If position embeddings are learned, increasing the size will add
+                newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
+                position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
+                add correct vectors at the end following the position encoding algorithm, whereas reducing the size
+                will remove vectors from the end.
+        """
+        logger.info(f"Setting `config.max_position_embeddings={new_num_position_embeddings}`...")
+        self.config.max_position_embeddings = new_num_position_embeddings
+
+        self.embed_positions = PegasusSinusoidalPositionalEmbedding(
+            self.config.max_position_embeddings,
+            self.config.d_model,
+            self.padding_idx,
+        )
+        self.embed_positions._init_weight()
+        self.embed_positions.to(self.device)
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings matrix
+        """
+        return self.embed_positions
+
+    def forward(
+        self,
+        input_ids=None,
+        attention_mask=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        head_mask=None,
+        cross_attn_head_mask=None,
+        past_key_values=None,
+        inputs_embeds=None,
+        use_cache=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+        cache_position=None,
+    ):
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules in decoder to avoid performing
+                cross-attention on hidden heads. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+                It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+                Contains pre-computed hidden-states (key and values in the self-attention blocks and in the
+                cross-attention blocks) that can be used (see `past_key_values` input) to speed up sequential decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence. It is used to update the
+                cache in the correct position and to infer the complete sequence length.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input = input_ids
+            input_shape = input.shape
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input)
+
+        # important to apply scale outside of `if` in case users pass `embeds`
+        inputs_embeds = inputs_embeds * self.embed_scale
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # initialize `past_key_values`
+        if use_cache and past_key_values is None:
+            past_key_values = (
+                EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+                if encoder_hidden_states is not None
+                else DynamicCache(config=self.config)
+            )
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        batch_size, seq_length = inputs_embeds.size()[:-1]
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        if cache_position is None:
+            cache_position = torch.arange(
+                past_key_values_length, past_key_values_length + seq_length, device=inputs_embeds.device
+            )
+
+        if attention_mask is None and not is_torchdynamo_compiling():
+            # required mask seq length can be calculated via length of past cache
+            mask_seq_length = past_key_values_length + seq_length
+            attention_mask = torch.ones(batch_size, mask_seq_length, device=inputs_embeds.device)
+
+        self_attn_cache = (
+            past_key_values.self_attention_cache
+            if isinstance(past_key_values, EncoderDecoderCache)
+            else past_key_values
+        )
+
+        causal_mask = self._update_causal_mask(
+            attention_mask,
+            inputs_embeds,
+            cache_position,
+            self_attn_cache,
+        )
+        encoder_attention_mask = self._update_cross_attn_mask(
+            encoder_hidden_states,
+            encoder_attention_mask,
+            input_shape,
+            inputs_embeds,
+        )
+
+        # embed positions
+        positions = self.embed_positions((batch_size, seq_length), past_key_values_length, position_ids=cache_position)
+        hidden_states = inputs_embeds + positions
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        # check if head_mask/cross_attn_head_mask has a correct number of layers specified if desired
+        for attn_mask, mask_name in zip([head_mask, cross_attn_head_mask], ["head_mask", "cross_attn_head_mask"]):
+            if attn_mask is not None:
+                if attn_mask.size()[0] != len(self.layers):
+                    raise ValueError(
+                        f"The `{mask_name}` should be specified for {len(self.layers)} layers, but it is for"
+                        f" {head_mask.size()[0]}."
+                    )
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                causal_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                layer_head_mask=(head_mask[idx] if head_mask is not None else None),
+                cross_attn_layer_head_mask=(cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None),
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring
+class PegasusModel(PegasusPreTrainedModel):
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight"]
+
+    def __init__(self, config: PegasusConfig):
+        super().__init__(config)
+
+        padding_idx, vocab_size = config.pad_token_id, config.vocab_size
+        self.shared = nn.Embedding(vocab_size, config.d_model, padding_idx)
+
+        self.encoder = PegasusEncoder(config, self.shared)
+        self.decoder = PegasusDecoder(config, self.shared)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, value):
+        self.shared = value
+        self.encoder.embed_tokens = self.shared
+        self.decoder.embed_tokens = self.shared
+
+    def get_encoder(self):
+        return self.encoder
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings matrix of the model if `new_num_position_embeddings !=
+        config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The number of new position embeddings. If position embeddings are learned, increasing the size will add
+                newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
+                position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
+                add correct vectors at the end following the position encoding algorithm, whereas reducing the size
+                will remove vectors from the end.
+        """
+        self.config.max_position_embeddings = new_num_position_embeddings
+        self.encoder.resize_position_embeddings(new_num_position_embeddings)
+        self.decoder.resize_position_embeddings(new_num_position_embeddings)
+
+    def get_position_embeddings(self) -> tuple[nn.Embedding]:
+        """
+        Returns the position embeddings matrix
+        """
+        return (self.encoder.get_position_embeddings(), self.decoder.get_position_embeddings())
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, Seq2SeqModelOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, PegasusModel
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
+        >>> model = PegasusModel.from_pretrained("google/pegasus-large")
+
+        >>> inputs = tokenizer("Studies have been shown that owning a dog is good for you", return_tensors="pt")
+        >>> decoder_inputs = tokenizer("Studies show that", return_tensors="pt")
+        >>> outputs = model(input_ids=inputs.input_ids, decoder_input_ids=decoder_inputs.input_ids)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 4, 1024]
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The PEGASUS Model with a language modeling head. Can be used for summarization.
+    """
+)
+class PegasusForConditionalGeneration(PegasusPreTrainedModel, GenerationMixin):
+    base_model_prefix = "model"
+    _keys_to_ignore_on_load_missing = ["final_logits_bias"]
+    _tied_weights_keys = ["encoder.embed_tokens.weight", "decoder.embed_tokens.weight", "lm_head.weight"]
+
+    def __init__(self, config: PegasusConfig):
+        super().__init__(config)
+        self.model = PegasusModel(config)
+        self.register_buffer("final_logits_bias", torch.zeros((1, self.model.shared.num_embeddings)))
+        self.lm_head = nn.Linear(config.d_model, self.model.shared.num_embeddings, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.model.get_encoder()
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    def resize_token_embeddings(
+        self, new_num_tokens: int, pad_to_multiple_of: Optional[int] = None, mean_resizing: bool = True
+    ) -> nn.Embedding:
+        new_embeddings = super().resize_token_embeddings(new_num_tokens, pad_to_multiple_of, mean_resizing)
+        self._resize_final_logits_bias(new_embeddings.weight.shape[0])
+        return new_embeddings
+
+    def _resize_final_logits_bias(self, new_num_tokens: int) -> None:
+        old_num_tokens = self.final_logits_bias.shape[-1]
+        if new_num_tokens <= old_num_tokens:
+            new_bias = self.final_logits_bias[:, :new_num_tokens]
+        else:
+            extra_bias = torch.zeros((1, new_num_tokens - old_num_tokens), device=self.final_logits_bias.device)
+            new_bias = torch.cat([self.final_logits_bias, extra_bias], dim=1)
+        self.register_buffer("final_logits_bias", new_bias)
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings matrix of the model if `new_num_position_embeddings !=
+        config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The number of new position embeddings. If position embeddings are learned, increasing the size will add
+                newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
+                position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
+                add correct vectors at the end following the position encoding algorithm, whereas reducing the size
+                will remove vectors from the end.
+        """
+        self.config.max_position_embeddings = new_num_position_embeddings
+        self.model.encoder.resize_position_embeddings(new_num_position_embeddings)
+        self.model.decoder.resize_position_embeddings(new_num_position_embeddings)
+
+    def get_position_embeddings(self) -> tuple[nn.Embedding]:
+        """
+        Returns the position embeddings matrix
+        """
+        return (self.model.encoder.get_position_embeddings(), self.model.decoder.get_position_embeddings())
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.Tensor] = None,
+        decoder_attention_mask: Optional[torch.Tensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        decoder_head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[torch.FloatTensor]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        decoder_inputs_embeds: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, Seq2SeqLMOutput]:
+        r"""
+        decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+        decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+            be used by default.
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules in the decoder. Mask values selected in `[0,
+            1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example Summarization:
+
+        ```python
+        >>> from transformers import AutoTokenizer, PegasusForConditionalGeneration
+
+        >>> model = PegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
+
+        >>> ARTICLE_TO_SUMMARIZE = (
+        ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
+        ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
+        ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
+        ... )
+        >>> inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="pt")
+
+        >>> # Generate Summary
+        >>> summary_ids = model.generate(inputs["input_ids"])
+        >>> tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "California's largest electricity provider has turned off power to hundreds of thousands of customers."
+        ```
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        lm_logits = self.lm_head(outputs[0]) + self.final_logits_bias
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+
+# Copied from transformers.models.bart.modeling_bart.BartDecoderWrapper with Bart->Pegasus
+class PegasusDecoderWrapper(PegasusPreTrainedModel):
+    """
+    This wrapper class is a helper class to correctly load pretrained checkpoints when the causal language model is
+    used in combination with the [`EncoderDecoderModel`] framework.
+    """
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.decoder = PegasusDecoder(config)
+
+    def forward(self, *args, **kwargs):
+        return self.decoder(*args, **kwargs)
+
+
+class PegasusForCausalLM(PegasusPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config):
+        config = copy.deepcopy(config)
+        config.is_decoder = True
+        config.is_encoder_decoder = False
+        super().__init__(config)
+        self.model = PegasusDecoderWrapper(config)
+
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
+    def set_decoder(self, decoder):
+        self.model.decoder = decoder
+
+    def get_decoder(self):
+        return self.model.decoder
+
+    def get_position_embeddings(self) -> nn.Embedding:
+        """
+        Returns the position embeddings matrix
+        """
+        return self.model.decoder.get_position_embeddings()
+
+    def resize_position_embeddings(self, new_num_position_embeddings: int):
+        """
+        Resizes position embeddings matrix of the model if `new_num_position_embeddings !=
+        config.max_position_embeddings`.
+
+        Arguments:
+            new_num_position_embeddings (`int`):
+                The number of new position embeddings. If position embeddings are learned, increasing the size will add
+                newly initialized vectors at the end, whereas reducing the size will remove vectors from the end. If
+                position embeddings are not learned (*e.g.* sinusoidal position embeddings), increasing the size will
+                add correct vectors at the end following the position encoding algorithm, whereas reducing the size
+                will remove vectors from the end.
+        """
+        self.config.max_position_embeddings = new_num_position_embeddings
+        self.model.decoder.resize_position_embeddings(new_num_position_embeddings)
+
+    @auto_docstring
+    # Copied from transformers.models.bart.modeling_bart.BartForCausalLM.forward with Bart->Pegasus, facebook/bart-base->google/pegasus-large
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.FloatTensor] = None,
+        head_mask: Optional[torch.Tensor] = None,
+        cross_attn_head_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, CausalLMOutputWithCrossAttentions]:
+        r"""
+        cross_attn_head_mask (`torch.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, PegasusForCausalLM
+
+        >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-large")
+        >>> model = PegasusForCausalLM.from_pretrained("google/pegasus-large", add_cross_attention=False)
+        >>> assert model.config.is_decoder, f"{model.__class__} has to be configured as a decoder."
+        >>> inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
+        >>> outputs = model(**inputs)
+
+        >>> logits = outputs.logits
+        >>> expected_shape = [1, inputs.input_ids.shape[-1], model.config.vocab_size]
+        >>> list(logits.shape) == expected_shape
+        True
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model.decoder(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            head_mask=head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        logits = self.lm_head(outputs[0])
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithCrossAttentions(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            cross_attentions=outputs.cross_attentions,
+        )
+
+
+__all__ = ["PegasusForCausalLM", "PegasusForConditionalGeneration", "PegasusModel", "PegasusPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_tf_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_tf_pegasus.py
new file mode 100644
index 0000000000000000000000000000000000000000..d159fc00138dcb6ca2f14213857b6b21e97c6865
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/modeling_tf_pegasus.py
@@ -0,0 +1,1573 @@
+# coding=utf-8
+# Copyright 2021, Google Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 Pegasus model."""
+
+from __future__ import annotations
+
+import random
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...modeling_tf_outputs import (
+    TFBaseModelOutput,
+    TFBaseModelOutputWithPastAndCrossAttentions,
+    TFSeq2SeqLMOutput,
+    TFSeq2SeqModelOutput,
+)
+
+# Public API
+from ...modeling_tf_utils import (
+    TFCausalLanguageModelingLoss,
+    TFModelInputType,
+    TFPreTrainedModel,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import check_embeddings_within_bounds, shape_list, stable_softmax
+from ...utils import (
+    add_code_sample_docstrings,
+    add_end_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    logging,
+    replace_return_docstrings,
+)
+from .configuration_pegasus import PegasusConfig
+
+
+logger = logging.get_logger(__name__)
+
+_CHECKPOINT_FOR_DOC = "google/pegasus-large"
+_CONFIG_FOR_DOC = "PegasusConfig"
+
+
+LARGE_NEGATIVE = -1e8
+
+
+# Copied from transformers.models.bart.modeling_tf_bart.shift_tokens_right
+def shift_tokens_right(input_ids: tf.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    pad_token_id = tf.cast(pad_token_id, input_ids.dtype)
+    decoder_start_token_id = tf.cast(decoder_start_token_id, input_ids.dtype)
+    start_tokens = tf.fill(
+        (shape_list(input_ids)[0], 1), tf.convert_to_tensor(decoder_start_token_id, input_ids.dtype)
+    )
+    shifted_input_ids = tf.concat([start_tokens, input_ids[:, :-1]], -1)
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids = tf.where(
+        shifted_input_ids == -100,
+        tf.fill(shape_list(shifted_input_ids), tf.convert_to_tensor(pad_token_id, input_ids.dtype)),
+        shifted_input_ids,
+    )
+
+    # "Verify that `labels` has only positive values and -100"
+    assert_gte0 = tf.debugging.assert_greater_equal(shifted_input_ids, tf.constant(0, dtype=input_ids.dtype))
+
+    # Make sure the assertion op is called by wrapping the result in an identity no-op
+    with tf.control_dependencies([assert_gte0]):
+        shifted_input_ids = tf.identity(shifted_input_ids)
+
+    return shifted_input_ids
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._make_causal_mask
+def _make_causal_mask(input_ids_shape: tf.TensorShape, past_key_values_length: int = 0):
+    """
+    Make causal mask used for bi-directional self-attention.
+    """
+    bsz = input_ids_shape[0]
+    tgt_len = input_ids_shape[1]
+    mask = tf.ones((tgt_len, tgt_len)) * LARGE_NEGATIVE
+    mask_cond = tf.range(shape_list(mask)[-1])
+
+    mask = tf.where(mask_cond < tf.reshape(mask_cond + 1, (shape_list(mask)[-1], 1)), 0.0, mask)
+
+    if past_key_values_length > 0:
+        mask = tf.concat([tf.zeros((tgt_len, past_key_values_length)), mask], axis=-1)
+
+    return tf.tile(mask[None, None, :, :], (bsz, 1, 1, 1))
+
+
+# Copied from transformers.models.bart.modeling_tf_bart._expand_mask
+def _expand_mask(mask: tf.Tensor, tgt_len: int | None = None):
+    """
+    Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`.
+    """
+    src_len = shape_list(mask)[1]
+    tgt_len = tgt_len if tgt_len is not None else src_len
+    one_cst = tf.constant(1.0)
+    mask = tf.cast(mask, dtype=one_cst.dtype)
+    expanded_mask = tf.tile(mask[:, None, None, :], (1, 1, tgt_len, 1))
+
+    return (one_cst - expanded_mask) * LARGE_NEGATIVE
+
+
+# Copied from transformers.models.marian.modeling_tf_marian.TFMarianSinusoidalPositionalEmbedding with Marian->Pegasus
+class TFPegasusSinusoidalPositionalEmbedding(keras.layers.Layer):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, **kwargs):
+        super().__init__(**kwargs)
+
+        if embedding_dim % 2 != 0:
+            raise NotImplementedError(f"odd embedding_dim {embedding_dim} not supported")
+
+        self.embedding_dim = embedding_dim
+        self.num_positions = num_positions
+
+    def build(self, input_shape: tf.TensorShape):
+        """
+        Build shared token embedding layer Shared weights logic adapted from
+        https://github.com/tensorflow/models/blob/a009f4fb9d2fc4949e32192a944688925ef78659/official/transformer/v2/embedding_layer.py#L24
+        """
+
+        weight = self._init_weight(self.num_positions, self.embedding_dim)
+
+        self.weight = self.add_weight(
+            name="embeddings",
+            shape=[self.num_positions, self.embedding_dim],
+        )
+        weight = tf.cast(weight, dtype=self.weight.dtype)
+
+        self.weight.assign(weight)
+
+        super().build(input_shape)
+
+    @staticmethod
+    def _init_weight(n_pos: int, dim: int):
+        """
+        Identical to the XLM create_sinusoidal_embeddings except features are not interleaved. The cos features are in
+        the 2nd half of the vector. [dim // 2:]
+        """
+        position_enc = np.array(
+            [[pos / np.power(10000, 2 * (j // 2) / dim) for j in range(dim)] for pos in range(n_pos)]
+        )
+        table = np.zeros_like(position_enc)
+        # index 0 is all zero
+        table[:, 0 : dim // 2] = np.sin(position_enc[:, 0::2])
+        table[:, dim // 2 :] = np.cos(position_enc[:, 1::2])
+        # convert to tensor
+        table = tf.convert_to_tensor(table)
+        tf.stop_gradient(table)
+        return table
+
+    def call(
+        self, input_shape: tf.TensorShape, past_key_values_length: int = 0, position_ids: tf.Tensor | None = None
+    ):
+        """Input is expected to be of size [bsz x seqlen]."""
+        if position_ids is None:
+            seq_len = input_shape[1]
+            position_ids = tf.range(past_key_values_length, seq_len + past_key_values_length, delta=1, name="range")
+        return tf.gather(self.weight, position_ids)
+
+
+# Copied from transformers.models.bart.modeling_tf_bart.TFBartAttention with Bart->Pegasus
+class TFPegasusAttention(keras.layers.Layer):
+    """Multi-headed attention from "Attention Is All You Need"""
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+
+        self.num_heads = num_heads
+        self.dropout = keras.layers.Dropout(dropout)
+        self.head_dim = embed_dim // num_heads
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+
+        self.k_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="k_proj")
+        self.q_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="q_proj")
+        self.v_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="v_proj")
+        self.out_proj = keras.layers.Dense(embed_dim, use_bias=bias, name="out_proj")
+
+    def _shape(self, tensor: tf.Tensor, seq_len: int, bsz: int):
+        return tf.transpose(tf.reshape(tensor, (bsz, seq_len, self.num_heads, self.head_dim)), (0, 2, 1, 3))
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        key_value_states: tf.Tensor | None = None,
+        past_key_value: tuple[tuple[tf.Tensor]] | None = None,
+        attention_mask: tf.Tensor | None = None,
+        layer_head_mask: tf.Tensor | None = None,
+        training: bool | None = False,
+    ) -> tuple[tf.Tensor, tf.Tensor | None]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if key_value_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = key_value_states is not None
+        bsz, tgt_len, embed_dim = shape_list(hidden_states)
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+        # get key, value proj
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_states = past_key_value[0]
+            value_states = past_key_value[1]
+        elif is_cross_attention:
+            # cross_attentions
+            key_states = self._shape(self.k_proj(key_value_states), -1, bsz)
+            value_states = self._shape(self.v_proj(key_value_states), -1, bsz)
+        elif past_key_value is not None:
+            # reuse k, v, self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+            key_states = tf.concat([past_key_value[0], key_states], axis=2)
+            value_states = tf.concat([past_key_value[1], value_states], axis=2)
+        else:
+            # self_attention
+            key_states = self._shape(self.k_proj(hidden_states), -1, bsz)
+            value_states = self._shape(self.v_proj(hidden_states), -1, bsz)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_states, value_states)
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = tf.reshape(self._shape(query_states, tgt_len, bsz), proj_shape)
+        key_states = tf.reshape(key_states, proj_shape)
+        value_states = tf.reshape(value_states, proj_shape)
+
+        src_len = shape_list(key_states)[1]
+        attn_weights = tf.matmul(query_states, key_states, transpose_b=True)
+
+        tf.debugging.assert_equal(
+            shape_list(attn_weights),
+            [bsz * self.num_heads, tgt_len, src_len],
+            message=(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {shape_list(attn_weights)}"
+            ),
+        )
+
+        if attention_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(attention_mask),
+                [bsz, 1, tgt_len, src_len],
+                message=(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is"
+                    f" {shape_list(attention_mask)}"
+                ),
+            )
+
+            attention_mask = tf.cast(attention_mask, dtype=attn_weights.dtype)
+            attn_weights = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len)) + attention_mask
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        attn_weights = stable_softmax(attn_weights, axis=-1)
+
+        if layer_head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(layer_head_mask),
+                [self.num_heads],
+                message=(
+                    f"Head mask for a single layer should be of size {(self.num_heads)}, but is"
+                    f" {shape_list(layer_head_mask)}"
+                ),
+            )
+
+            attn_weights = tf.reshape(layer_head_mask, (1, -1, 1, 1)) * tf.reshape(
+                attn_weights, (bsz, self.num_heads, tgt_len, src_len)
+            )
+            attn_weights = tf.reshape(attn_weights, (bsz * self.num_heads, tgt_len, src_len))
+
+        attn_probs = self.dropout(attn_weights, training=training)
+        attn_output = tf.matmul(attn_probs, value_states)
+
+        tf.debugging.assert_equal(
+            shape_list(attn_output),
+            [bsz * self.num_heads, tgt_len, self.head_dim],
+            message=(
+                f"`attn_output` should be of size {(bsz, self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {shape_list(attn_output)}"
+            ),
+        )
+
+        attn_output = tf.transpose(
+            tf.reshape(attn_output, (bsz, self.num_heads, tgt_len, self.head_dim)), (0, 2, 1, 3)
+        )
+        attn_output = tf.reshape(attn_output, (bsz, tgt_len, embed_dim))
+
+        attn_output = self.out_proj(attn_output)
+        attn_weights: tf.Tensor = tf.reshape(attn_weights, (bsz, self.num_heads, tgt_len, src_len))
+
+        return attn_output, attn_weights, past_key_value
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "k_proj", None) is not None:
+            with tf.name_scope(self.k_proj.name):
+                self.k_proj.build([None, None, self.embed_dim])
+        if getattr(self, "q_proj", None) is not None:
+            with tf.name_scope(self.q_proj.name):
+                self.q_proj.build([None, None, self.embed_dim])
+        if getattr(self, "v_proj", None) is not None:
+            with tf.name_scope(self.v_proj.name):
+                self.v_proj.build([None, None, self.embed_dim])
+        if getattr(self, "out_proj", None) is not None:
+            with tf.name_scope(self.out_proj.name):
+                self.out_proj.build([None, None, self.embed_dim])
+
+
+# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartEncoderLayer with MBart->Pegasus
+class TFPegasusEncoderLayer(keras.layers.Layer):
+    def __init__(self, config: PegasusConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = config.d_model
+        self.self_attn = TFPegasusAttention(
+            self.embed_dim, config.encoder_attention_heads, dropout=config.attention_dropout, name="self_attn"
+        )
+        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.activation_fn = get_tf_activation(config.activation_function)
+        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
+        self.fc1 = keras.layers.Dense(config.encoder_ffn_dim, name="fc1")
+        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
+        self.config = config
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        layer_head_mask: tf.Tensor,
+        training: bool | None = False,
+    ):
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+            attention_mask (`tf.Tensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+                *(encoder_attention_heads,)*
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, self_attn_weights, _ = self.self_attn(
+            hidden_states=hidden_states, attention_mask=attention_mask, layer_head_mask=layer_head_mask
+        )
+
+        tf.debugging.assert_equal(
+            shape_list(hidden_states),
+            shape_list(residual),
+            message=f"Self attn modified the shape of query {shape_list(residual)} to {shape_list(hidden_states)}",
+        )
+
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.activation_dropout(hidden_states, training=training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = residual + hidden_states
+
+        return hidden_states, self_attn_weights
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attn", None) is not None:
+            with tf.name_scope(self.self_attn.name):
+                self.self_attn.build(None)
+        if getattr(self, "self_attn_layer_norm", None) is not None:
+            with tf.name_scope(self.self_attn_layer_norm.name):
+                self.self_attn_layer_norm.build([None, None, self.embed_dim])
+        if getattr(self, "fc1", None) is not None:
+            with tf.name_scope(self.fc1.name):
+                self.fc1.build([None, None, self.embed_dim])
+        if getattr(self, "fc2", None) is not None:
+            with tf.name_scope(self.fc2.name):
+                self.fc2.build([None, None, self.config.encoder_ffn_dim])
+        if getattr(self, "final_layer_norm", None) is not None:
+            with tf.name_scope(self.final_layer_norm.name):
+                self.final_layer_norm.build([None, None, self.embed_dim])
+
+
+# Copied from transformers.models.mbart.modeling_tf_mbart.TFMBartDecoderLayer with MBart->Pegasus
+class TFPegasusDecoderLayer(keras.layers.Layer):
+    def __init__(self, config: PegasusConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = config.d_model
+        self.self_attn = TFPegasusAttention(
+            embed_dim=self.embed_dim,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            name="self_attn",
+            is_decoder=True,
+        )
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.activation_fn = get_tf_activation(config.activation_function)
+        self.activation_dropout = keras.layers.Dropout(config.activation_dropout)
+
+        self.self_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="self_attn_layer_norm")
+        self.encoder_attn = TFPegasusAttention(
+            self.embed_dim,
+            config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+            name="encoder_attn",
+            is_decoder=True,
+        )
+        self.encoder_attn_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="encoder_attn_layer_norm")
+        self.fc1 = keras.layers.Dense(config.decoder_ffn_dim, name="fc1")
+        self.fc2 = keras.layers.Dense(self.embed_dim, name="fc2")
+        self.final_layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="final_layer_norm")
+        self.config = config
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor | None = None,
+        encoder_hidden_states: tf.Tensor | None = None,
+        encoder_attention_mask: tf.Tensor | None = None,
+        layer_head_mask: tf.Tensor | None = None,
+        cross_attn_layer_head_mask: tf.Tensor | None = None,
+        past_key_value: tuple[tf.Tensor] | None = None,
+        training: bool | None = False,
+    ) -> tuple[tf.Tensor, tf.Tensor, tuple[tuple[tf.Tensor]]]:
+        """
+        Args:
+            hidden_states (`tf.Tensor`): input to the layer of shape *(batch, seq_len, embed_dim)*
+            attention_mask (`tf.Tensor`): attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            encoder_hidden_states (`tf.Tensor`):
+                cross attention input to the layer of shape *(batch, seq_len, embed_dim)*
+            encoder_attention_mask (`tf.Tensor`): encoder attention mask of size
+                *(batch, 1, tgt_len, src_len)* where padding elements are indicated by very large negative values.
+            layer_head_mask (`tf.Tensor`): mask for attention heads in a given layer of size
+                *(decoder_attention_heads,)*
+            cross_attn_layer_head_mask (`tf.Tensor`): mask for heads of the cross-attention module.
+                *(decoder_attention_heads,)*
+            past_key_value (`Tuple(tf.Tensor)`): cached past key and value projection states
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        # add present self-attn cache to positions 1,2 of present_key_value tuple
+        hidden_states, self_attn_weights, present_key_value = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_value=self_attn_past_key_value,
+            attention_mask=attention_mask,
+            layer_head_mask=layer_head_mask,
+        )
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_present_key_value = None
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+            # cross_attn cached key/values tuple is at positions 3,4 of present_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            hidden_states, cross_attn_weights, cross_attn_present_key_value = self.encoder_attn(
+                hidden_states=hidden_states,
+                key_value_states=encoder_hidden_states,
+                attention_mask=encoder_attention_mask,
+                layer_head_mask=cross_attn_layer_head_mask,
+                past_key_value=cross_attn_past_key_value,
+            )
+            hidden_states = self.dropout(hidden_states, training=training)
+            hidden_states = residual + hidden_states
+
+            # add cross-attn to positions 3,4 of present_key_value tuple
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = self.activation_dropout(hidden_states, training=training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = residual + hidden_states
+
+        return (
+            hidden_states,
+            self_attn_weights,
+            cross_attn_weights,
+            present_key_value,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self_attn", None) is not None:
+            with tf.name_scope(self.self_attn.name):
+                self.self_attn.build(None)
+        if getattr(self, "self_attn_layer_norm", None) is not None:
+            with tf.name_scope(self.self_attn_layer_norm.name):
+                self.self_attn_layer_norm.build([None, None, self.embed_dim])
+        if getattr(self, "encoder_attn", None) is not None:
+            with tf.name_scope(self.encoder_attn.name):
+                self.encoder_attn.build(None)
+        if getattr(self, "encoder_attn_layer_norm", None) is not None:
+            with tf.name_scope(self.encoder_attn_layer_norm.name):
+                self.encoder_attn_layer_norm.build([None, None, self.embed_dim])
+        if getattr(self, "fc1", None) is not None:
+            with tf.name_scope(self.fc1.name):
+                self.fc1.build([None, None, self.embed_dim])
+        if getattr(self, "fc2", None) is not None:
+            with tf.name_scope(self.fc2.name):
+                self.fc2.build([None, None, self.config.decoder_ffn_dim])
+        if getattr(self, "final_layer_norm", None) is not None:
+            with tf.name_scope(self.final_layer_norm.name):
+                self.final_layer_norm.build([None, None, self.embed_dim])
+
+
+class TFPegasusPreTrainedModel(TFPreTrainedModel):
+    config_class = PegasusConfig
+    base_model_prefix = "model"
+
+
+PEGASUS_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    <Tip>
+
+    TensorFlow models and layers in `transformers` accept two formats as input:
+
+    - having all inputs as keyword arguments (like PyTorch models), or
+    - having all inputs as a list, tuple or dict in the first positional argument.
+
+    The reason the second format is supported is that Keras methods prefer this format when passing inputs to models
+    and layers. Because of this support, when using methods like `model.fit()` things should "just work" for you - just
+    pass your inputs and labels in any format that `model.fit()` supports! If, however, you want to use the second
+    format outside of Keras methods like `fit()` and `predict()`, such as when creating your own layers or models with
+    the Keras `Functional` API, there are three possibilities you can use to gather all the input Tensors in the first
+    positional argument:
+
+    - a single Tensor with `input_ids` only and nothing else: `model(input_ids)`
+    - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+    `model([input_ids, attention_mask])` or `model([input_ids, attention_mask, token_type_ids])`
+    - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+    `model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+    Note that when creating models and layers with
+    [subclassing](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) then you don't need to worry
+    about any of this, as you can just pass inputs like you would to any other Python function!
+
+    </Tip>
+
+    Args:
+        config ([`PegasusConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+PEGASUS_GENERATION_EXAMPLE = r"""
+    Summarization example:
+
+    ```python
+    >>> from transformers import AutoTokenizer, TFPegasusForConditionalGeneration
+
+    >>> model = TFPegasusForConditionalGeneration.from_pretrained("google/pegasus-xsum")
+    >>> tokenizer = AutoTokenizer.from_pretrained("google/pegasus-xsum")
+
+    >>> ARTICLE_TO_SUMMARIZE = (
+    ...     "PG&E stated it scheduled the blackouts in response to forecasts for high winds "
+    ...     "amid dry conditions. The aim is to reduce the risk of wildfires. Nearly 800 thousand customers were "
+    ...     "scheduled to be affected by the shutoffs which were expected to last through at least midday tomorrow."
+    ... )
+    >>> inputs = tokenizer(ARTICLE_TO_SUMMARIZE, max_length=1024, return_tensors="tf")
+
+    >>> # Generate Summary
+    >>> summary_ids = model.generate(input_ids)
+    >>> print(tokenizer.batch_decode(summary_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False))
+    ```
+"""
+
+PEGASUS_INPUTS_DOCSTRING = r"""
+    Args:
+        input_ids (`tf.Tensor` of shape `({0})`):
+            Indices of input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are input IDs?](../glossary#input-ids)
+        attention_mask (`tf.Tensor` of shape `({0})`, *optional*):
+            Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+            - 1 for tokens that are **not masked**,
+            - 0 for tokens that are **masked**.
+
+            [What are attention masks?](../glossary#attention-mask)
+        decoder_input_ids (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            Indices of decoder input sequence tokens in the vocabulary.
+
+            Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+            [`PreTrainedTokenizer.__call__`] for details.
+
+            [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+            Pegasus uses the `pad_token_id` as the starting token for `decoder_input_ids` generation. If
+            `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
+            `past_key_values`).
+        decoder_attention_mask (`tf.Tensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+            will be made by default and ignore pad tokens. It is not recommended to set this for most use cases.
+        decoder_position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+            range `[0, config.max_position_embeddings - 1]`.
+        head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the encoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        decoder_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the attention modules in the decoder. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+            Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+            - 1 indicates the head is **not masked**,
+            - 0 indicates the head is **masked**.
+
+        encoder_outputs (`tf.FloatTensor`, *optional*):
+            hidden states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
+            of shape `(batch_size, sequence_length, hidden_size)` is a sequence of
+        past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers`)
+            contains precomputed key and value hidden states of the attention blocks. Can be used to speed up decoding.
+            If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those that
+            don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of all
+            `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+        inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+            This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+            than the model's internal embedding lookup matrix.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding (see
+            `past_key_values`). Set to `False` during training, `True` during generation output_attentions (`bool`,
+            *optional*): Whether or not to return the attentions tensors of all attention layers. See `attentions`
+            under returned tensors for more detail. This argument can be used only in eager mode, in graph mode the
+            value in the config will be used instead.
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+        training (`bool`, *optional*, defaults to `False`):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@keras_serializable
+class TFPegasusEncoder(keras.layers.Layer):
+    config_class = PegasusConfig
+    """
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a
+    [`TFPegasusEncoderLayer`].
+
+    Args:
+        config: PegasusConfig
+    """
+
+    def __init__(self, config: PegasusConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.dropout = keras.layers.Dropout(config.dropout)
+        self.layerdrop = config.encoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.max_source_positions = config.max_position_embeddings
+        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
+
+        self.embed_tokens = embed_tokens
+        self.embed_positions = TFPegasusSinusoidalPositionalEmbedding(
+            config.max_position_embeddings,
+            config.d_model,
+            name="embed_positions",
+        )
+        self.layers = [TFPegasusEncoderLayer(config, name=f"layers.{i}") for i in range(config.encoder_layers)]
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
+
+    def get_embed_tokens(self):
+        return self.embed_tokens
+
+    def set_embed_tokens(self, embed_tokens):
+        self.embed_tokens = embed_tokens
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ):
+        """
+        Args:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`tf.Tensor` of shape `(encoder_layers, encoder_attention_heads)`, `optional):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
+                in the config will be used instead.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail. This argument can be used only in eager mode, in graph mode the value in the config
+                will be used instead.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used
+                in eager mode, in graph mode the value will always be set to True.
+            training (`bool`, *optional*, defaults to `False`):
+                Whether or not to use the model in training mode (some modules like dropout modules have different
+                behaviors between training and evaluation).
+        """
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        embed_pos = self.embed_positions(input_shape)
+        hidden_states = inputs_embeds + embed_pos
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        # check attention mask and invert
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _expand_mask(attention_mask)
+        else:
+            attention_mask = None
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # check if head_mask has a correct number of layers specified if desired
+        if head_mask is not None:
+            tf.debugging.assert_equal(
+                shape_list(head_mask)[0],
+                len(self.layers),
+                message=(
+                    f"The head_mask should be specified for {len(self.layers)} layers, but it is for"
+                    f" {shape_list(head_mask)[0]}."
+                ),
+            )
+
+        # encoder layers
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = random.uniform(0, 1)
+            if training and (dropout_probability < self.layerdrop):  # skip the layer
+                continue
+
+            hidden_states, attn = encoder_layer(
+                hidden_states,
+                attention_mask,
+                head_mask[idx] if head_mask is not None else None,
+            )
+
+            if output_attentions:
+                all_attentions += (attn,)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embed_positions", None) is not None:
+            with tf.name_scope(self.embed_positions.name):
+                self.embed_positions.build(None)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.d_model])
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFPegasusDecoder(keras.layers.Layer):
+    config_class = PegasusConfig
+    """
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`TFPegasusDecoderLayer`]
+
+    Args:
+        config: PegasusConfig
+        embed_tokens: output embedding
+    """
+
+    def __init__(self, config: PegasusConfig, embed_tokens: keras.layers.Embedding | None = None, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        self.padding_idx = config.pad_token_id
+        self.embed_tokens = embed_tokens
+        self.layerdrop = config.decoder_layerdrop
+        self.embed_positions = TFPegasusSinusoidalPositionalEmbedding(
+            config.max_position_embeddings,
+            config.d_model,
+            name="embed_positions",
+        )
+        self.embed_scale = tf.math.sqrt(float(config.d_model)) if config.scale_embedding else 1.0
+        self.layers = [TFPegasusDecoderLayer(config, name=f"layers.{i}") for i in range(config.decoder_layers)]
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-5, name="layer_norm")
+
+        self.dropout = keras.layers.Dropout(config.dropout)
+
+    def get_embed_tokens(self):
+        return self.embed_tokens
+
+    def set_embed_tokens(self, embed_tokens):
+        self.embed_tokens = embed_tokens
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        position_ids: tf.Tensor | None = None,
+        encoder_hidden_states: tf.Tensor | None = None,
+        encoder_attention_mask: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        cross_attn_head_mask: tf.Tensor | None = None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+    ):
+        r"""
+        Args:
+            input_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            position_ids (`tf.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of positions of each decoder input sequence tokens in the position embeddings. Selected in the
+                range `[0, config.max_position_embeddings - 1]`.
+            encoder_hidden_states (`tf.Tensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`tf.Tensor` of shape `(batch_size, encoder_sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding tokens indices of encoder input_ids. Mask values
+                selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            cross_attn_head_mask (`tf.Tensor` of shape `(decoder_layers, decoder_attention_heads)`, *optional*):
+                Mask to nullify selected heads of the cross-attention modules. Mask values selected in `[0, 1]`:
+
+                - 1 indicates the head is **not masked**,
+                - 0 indicates the head is **masked**.
+
+            past_key_values (`tuple[tuple[tf.Tensor]]` of length `config.n_layers` with each tuple having 2 tuples each of which has 2 tensors of shape `(batch_size, num_heads, sequence_length - 1, embed_size_per_head)`):
+                Contains precomputed key and value hidden-states of the attention blocks. Can be used to speed up
+                decoding.
+
+                If `past_key_values` are used, the user can optionally input only the last `decoder_input_ids` (those
+                that don't have their past key value states given to this model) of shape `(batch_size, 1)` instead of
+                all `decoder_input_ids` of shape `(batch_size, sequence_length)`.
+            inputs_embeds (`tf.Tensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail. This argument can be used only in eager mode, in graph mode the value
+                in the config will be used instead.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail. This argument can be used only in eager mode, in graph mode the value in the config
+                will be used instead.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used
+                in eager mode, in graph mode the value will always be set to True.
+            training (`bool`, *optional*, defaults to `False`):
+                Whether or not to use the model in training mode (some modules like dropout modules have different
+                behaviors between training and evaluation).
+        """
+
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input_shape = shape_list(input_ids)
+        elif inputs_embeds is not None:
+            input_shape = shape_list(inputs_embeds)[:-1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        past_key_values_length = shape_list(past_key_values[0][0])[2] if past_key_values is not None else 0
+
+        # embed positions
+        if position_ids is None:
+            positions = self.embed_positions(input_shape, past_key_values_length)
+        else:
+            positions = self.embed_positions(input_shape, position_ids=position_ids)
+
+        if inputs_embeds is None:
+            check_embeddings_within_bounds(input_ids, self.embed_tokens.input_dim)
+            inputs_embeds = self.embed_tokens(input_ids) * self.embed_scale
+
+        hidden_states = inputs_embeds
+
+        # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+        if input_shape[-1] > 1:
+            combined_attention_mask = _make_causal_mask(input_shape, past_key_values_length=past_key_values_length)
+        else:
+            combined_attention_mask = _expand_mask(
+                tf.ones((input_shape[0], input_shape[1] + past_key_values_length)), tgt_len=input_shape[-1]
+            )
+
+        if attention_mask is not None:
+            combined_attention_mask = combined_attention_mask + _expand_mask(attention_mask, tgt_len=input_shape[-1])
+
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _expand_mask(encoder_attention_mask, tgt_len=input_shape[-1])
+
+        hidden_states = self.dropout(hidden_states + positions, training=training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attns = () if (output_attentions and encoder_hidden_states is not None) else None
+        present_key_values = () if use_cache else None
+
+        # check if head_mask and cross_attn_head_mask have a correct number of layers specified if desired
+        for attn_mask_name, attn_mask in [("head_mask", head_mask), ("cross_attn_head_mask", cross_attn_head_mask)]:
+            if attn_mask is not None:
+                tf.debugging.assert_equal(
+                    shape_list(attn_mask)[0],
+                    len(self.layers),
+                    message=(
+                        f"The {attn_mask_name} should be specified for {len(self.layers)} layers, but it is for"
+                        f" {shape_list(attn_mask)[0]}."
+                    ),
+                )
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            dropout_probability = random.uniform(0, 1)
+
+            if training and (dropout_probability < self.layerdrop):
+                continue
+
+            past_key_value = past_key_values[idx] if past_key_values is not None else None
+
+            hidden_states, layer_self_attn, layer_cross_attn, present_key_value = decoder_layer(
+                hidden_states,
+                attention_mask=combined_attention_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                layer_head_mask=head_mask[idx] if head_mask is not None else None,
+                cross_attn_layer_head_mask=cross_attn_head_mask[idx] if cross_attn_head_mask is not None else None,
+                past_key_value=past_key_value,
+            )
+
+            if use_cache:
+                present_key_values += (present_key_value,)
+
+            if output_attentions:
+                all_self_attns += (layer_self_attn,)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attns += (layer_cross_attn,)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return hidden_states, present_key_values, all_hidden_states, all_self_attns, all_cross_attns
+        else:
+            return TFBaseModelOutputWithPastAndCrossAttentions(
+                last_hidden_state=hidden_states,
+                past_key_values=present_key_values,
+                hidden_states=all_hidden_states,
+                attentions=all_self_attns,
+                cross_attentions=all_cross_attns,
+            )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "embed_positions", None) is not None:
+            with tf.name_scope(self.embed_positions.name):
+                self.embed_positions.build(None)
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.config.d_model])
+        if getattr(self, "layers", None) is not None:
+            for layer in self.layers:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFPegasusMainLayer(keras.layers.Layer):
+    config_class = PegasusConfig
+
+    def __init__(self, config: PegasusConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        self.shared = keras.layers.Embedding(
+            input_dim=config.vocab_size,
+            output_dim=config.d_model,
+            embeddings_initializer=keras.initializers.TruncatedNormal(stddev=self.config.init_std),
+            name="model.shared",
+        )
+        # Additional attribute to specify the expected name scope of the layer (for loading/storing weights)
+        self.shared.load_weight_prefix = "model.shared"
+
+        self.encoder = TFPegasusEncoder(config, self.shared, name="encoder")
+        self.decoder = TFPegasusDecoder(config, self.shared, name="decoder")
+
+    def get_input_embeddings(self):
+        return self.shared
+
+    def set_input_embeddings(self, new_embeddings):
+        self.shared = new_embeddings
+        self.encoder.embed_tokens = self.shared
+        self.decoder.embed_tokens = self.shared
+
+    @unpack_inputs
+    def call(
+        self,
+        input_ids: tf.Tensor | None = None,
+        attention_mask: tf.Tensor | None = None,
+        decoder_input_ids: tf.Tensor | None = None,
+        decoder_attention_mask: tf.Tensor | None = None,
+        decoder_position_ids: tf.Tensor | None = None,
+        head_mask: tf.Tensor | None = None,
+        decoder_head_mask: tf.Tensor | None = None,
+        cross_attn_head_mask: tf.Tensor | None = None,
+        encoder_outputs: tuple | TFBaseModelOutput | None = None,
+        past_key_values: tuple[tuple[tf.Tensor]] | None = None,
+        inputs_embeds: tf.Tensor | None = None,
+        decoder_inputs_embeds: tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool | None = False,
+        **kwargs,
+    ):
+        if decoder_input_ids is None and decoder_inputs_embeds is None:
+            use_cache = False
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+                training=training,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a TFBaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, TFBaseModelOutput):
+            encoder_outputs = TFBaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+        # If the user passed a TFBaseModelOutput for encoder_outputs, we wrap it in a tuple when return_dict=False
+        elif not return_dict and not isinstance(encoder_outputs, tuple):
+            encoder_outputs = encoder_outputs.to_tuple()
+
+        decoder_outputs = self.decoder(
+            decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            position_ids=decoder_position_ids,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return TFSeq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        # The shared/tied weights expect to be in the model base namespace
+        # Adding "/" to the end (not the start!) of a tf.name_scope puts it in the root namespace rather than
+        # the current one.
+        with tf.name_scope(self.shared.load_weight_prefix + "/" + self.shared.name + "/"):
+            self.shared.build(None)
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+        if getattr(self, "decoder", None) is not None:
+            with tf.name_scope(self.decoder.name):
+                self.decoder.build(None)
+
+
+@add_start_docstrings(
+    "The bare PEGASUS Model outputting raw hidden-states without any specific head on top.",
+    PEGASUS_START_DOCSTRING,
+)
+class TFPegasusModel(TFPegasusPreTrainedModel):
+    def __init__(self, config: PegasusConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.model = TFPegasusMainLayer(config, name="model")
+
+    def get_encoder(self):
+        return self.model.encoder
+
+    def get_decoder(self):
+        return self.model.decoder
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSeq2SeqModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
+        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
+        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
+        encoder_outputs: tuple | TFBaseModelOutput | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+        **kwargs,
+    ) -> TFSeq2SeqModelOutput | tuple[tf.Tensor]:
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            encoder_outputs=encoder_outputs,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+
+        return outputs
+
+    # Copied from transformers.models.bart.modeling_tf_bart.TFBartModel.serving_output
+    def serving_output(self, output):
+        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
+        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
+        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
+        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
+        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
+        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
+
+        return TFSeq2SeqModelOutput(
+            last_hidden_state=output.last_hidden_state,
+            past_key_values=pkv,
+            decoder_hidden_states=dec_hs,
+            decoder_attentions=dec_attns,
+            cross_attentions=cross_attns,
+            encoder_last_hidden_state=output.encoder_last_hidden_state,
+            encoder_hidden_states=enc_hs,
+            encoder_attentions=enc_attns,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "model", None) is not None:
+            with tf.name_scope(self.model.name):
+                self.model.build(None)
+
+
+# Copied from transformers.models.bart.modeling_tf_bart.BiasLayer
+class BiasLayer(keras.layers.Layer):
+    """
+    Bias as a layer. It is used for serialization purposes: `keras.Model.save_weights` stores on a per-layer basis,
+    so all weights have to be registered in a layer.
+    """
+
+    def __init__(self, shape, initializer, trainable, name, **kwargs):
+        super().__init__(name=name, **kwargs)
+        # Note: the name of this variable will NOT be scoped when serialized, i.e. it will not be in the format of
+        # "outer_layer/inner_layer/.../name:0". Instead, it will be "name:0". For further details, see:
+        # https://github.com/huggingface/transformers/pull/18833#issuecomment-1233090214
+        self.bias = self.add_weight(name=name, shape=shape, initializer=initializer, trainable=trainable)
+
+    def call(self, x):
+        return x + self.bias
+
+
+@add_start_docstrings(
+    "The PEGASUS Model with a language modeling head. Can be used for summarization.",
+    PEGASUS_START_DOCSTRING,
+)
+class TFPegasusForConditionalGeneration(TFPegasusPreTrainedModel, TFCausalLanguageModelingLoss):
+    _keys_to_ignore_on_load_unexpected = [
+        r"model.encoder.embed_tokens.weight",
+        r"model.decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.model = TFPegasusMainLayer(config, name="model")
+        self.use_cache = config.use_cache
+        # final_bias_logits is registered as a buffer in pytorch, so not trainable for the sake of consistency.
+        self.bias_layer = BiasLayer(
+            name="final_logits_bias", shape=[1, config.vocab_size], initializer="zeros", trainable=False
+        )
+
+    def get_decoder(self):
+        return self.model.decoder
+
+    def get_encoder(self):
+        return self.model.encoder
+
+    def get_output_embeddings(self):
+        return self.get_input_embeddings()
+
+    def set_output_embeddings(self, value):
+        self.set_input_embeddings(value)
+
+    def get_bias(self):
+        return {"final_logits_bias": self.bias_layer.bias}
+
+    def set_bias(self, value):
+        # Replaces the existing layers containing bias for correct (de)serialization.
+        vocab_size = value["final_logits_bias"].shape[-1]
+        self.bias_layer = BiasLayer(
+            name="final_logits_bias", shape=[1, vocab_size], initializer="zeros", trainable=False
+        )
+        self.bias_layer.bias.assign(value["final_logits_bias"])
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(PEGASUS_INPUTS_DOCSTRING)
+    @replace_return_docstrings(output_type=TFSeq2SeqLMOutput, config_class=_CONFIG_FOR_DOC)
+    @add_end_docstrings(PEGASUS_GENERATION_EXAMPLE)
+    def call(
+        self,
+        input_ids: TFModelInputType | None = None,
+        attention_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_input_ids: np.ndarray | tf.Tensor | None = None,
+        decoder_attention_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_position_ids: np.ndarray | tf.Tensor | None = None,
+        head_mask: np.ndarray | tf.Tensor | None = None,
+        decoder_head_mask: np.ndarray | tf.Tensor | None = None,
+        cross_attn_head_mask: np.ndarray | tf.Tensor | None = None,
+        encoder_outputs: TFBaseModelOutput | None = None,
+        past_key_values: tuple[tuple[np.ndarray | tf.Tensor]] | None = None,
+        inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        decoder_inputs_embeds: np.ndarray | tf.Tensor | None = None,
+        use_cache: bool | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        labels: np.ndarray | tf.Tensor | None = None,
+        training: bool = False,
+    ) -> TFSeq2SeqLMOutput | tuple[tf.Tensor]:
+        """
+        labels (`tf.tensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Returns:
+
+        """
+
+        if labels is not None:
+            labels = tf.where(
+                labels == self.config.pad_token_id,
+                tf.cast(tf.fill(shape_list(labels), -100), labels.dtype),
+                labels,
+            )
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            decoder_position_ids=decoder_position_ids,
+            head_mask=head_mask,
+            decoder_head_mask=decoder_head_mask,
+            cross_attn_head_mask=cross_attn_head_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        lm_logits = tf.matmul(outputs[0], self.model.shared.weights, transpose_b=True)
+        lm_logits = self.bias_layer(lm_logits)
+        masked_lm_loss = None if labels is None else self.hf_compute_loss(labels, lm_logits)
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+        return TFSeq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,  # index 1 of d outputs
+            decoder_hidden_states=outputs.decoder_hidden_states,  # index 2 of d outputs
+            decoder_attentions=outputs.decoder_attentions,  # index 3 of d outputs
+            cross_attentions=outputs.cross_attentions,  # index 4 of d outputs
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,  # index 0 of encoder outputs
+            encoder_hidden_states=outputs.encoder_hidden_states,  # 1 of e out
+            encoder_attentions=outputs.encoder_attentions,  # 2 of e out
+        )
+
+    # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.serving_output
+    def serving_output(self, output):
+        pkv = tf.tuple(output.past_key_values)[1] if self.config.use_cache else None
+        dec_hs = tf.convert_to_tensor(output.decoder_hidden_states) if self.config.output_hidden_states else None
+        dec_attns = tf.convert_to_tensor(output.decoder_attentions) if self.config.output_attentions else None
+        cross_attns = tf.convert_to_tensor(output.cross_attentions) if self.config.output_attentions else None
+        enc_hs = tf.convert_to_tensor(output.encoder_hidden_states) if self.config.output_hidden_states else None
+        enc_attns = tf.convert_to_tensor(output.encoder_attentions) if self.config.output_attentions else None
+
+        return TFSeq2SeqLMOutput(
+            logits=output.logits,
+            past_key_values=pkv,
+            decoder_hidden_states=dec_hs,
+            decoder_attentions=dec_attns,
+            cross_attentions=cross_attns,
+            encoder_last_hidden_state=output.encoder_last_hidden_state,
+            encoder_hidden_states=enc_hs,
+            encoder_attentions=enc_attns,
+        )
+
+    # Copied from transformers.models.bart.modeling_tf_bart.TFBartForConditionalGeneration.prepare_inputs_for_generation
+    def prepare_inputs_for_generation(
+        self,
+        decoder_input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        decoder_attention_mask=None,
+        head_mask=None,
+        decoder_head_mask=None,
+        cross_attn_head_mask=None,
+        use_cache=None,
+        encoder_outputs=None,
+        **kwargs,
+    ):
+        # cut decoder_input_ids if past_key_values is used
+        if past_key_values is not None:
+            decoder_input_ids = decoder_input_ids[:, -1:]
+
+        if decoder_attention_mask is not None:  # xla
+            decoder_position_ids = tf.math.cumsum(decoder_attention_mask, axis=-1, exclusive=True)[:, -1:]
+        elif past_key_values is not None:  # no xla + past_key_values
+            decoder_position_ids = past_key_values[0][0].shape[2]
+        else:  # no xla + no past_key_values
+            decoder_position_ids = tf.range(decoder_input_ids.shape[1])
+
+        return {
+            "input_ids": None,  # encoder_outputs is defined. input_ids not needed
+            "encoder_outputs": encoder_outputs,
+            "past_key_values": past_key_values,
+            "decoder_input_ids": decoder_input_ids,
+            "attention_mask": attention_mask,
+            "decoder_attention_mask": decoder_attention_mask,
+            "decoder_position_ids": decoder_position_ids,
+            "head_mask": head_mask,
+            "decoder_head_mask": decoder_head_mask,
+            "cross_attn_head_mask": cross_attn_head_mask,
+            "use_cache": use_cache,  # change this to avoid caching (presumably for debugging)
+        }
+
+    def prepare_decoder_input_ids_from_labels(self, labels: tf.Tensor):
+        return shift_tokens_right(labels, self.config.pad_token_id, self.config.decoder_start_token_id)
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "model", None) is not None:
+            with tf.name_scope(self.model.name):
+                self.model.build(None)
+        if getattr(self, "bias_layer", None) is not None:
+            with tf.name_scope(self.bias_layer.name):
+                self.bias_layer.build(None)
+
+
+__all__ = ["TFPegasusForConditionalGeneration", "TFPegasusModel", "TFPegasusPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus.py
new file mode 100644
index 0000000000000000000000000000000000000000..b8a4a1c737d1592d9362f506e93b20854edfe7a5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus.py
@@ -0,0 +1,292 @@
+# coding=utf-8
+# Copyright 2020 Google and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import os
+from shutil import copyfile
+from typing import Any, Optional
+
+import sentencepiece as spm
+
+from ...tokenization_utils import AddedToken, PreTrainedTokenizer
+from ...utils import logging
+from ...utils.import_utils import requires
+
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model"}
+
+
+logger = logging.get_logger(__name__)
+
+
+# TODO ArthurZ refactor this to only use the added_tokens_encoder
+
+
+@requires(backends=("sentencepiece",))
+class PegasusTokenizer(PreTrainedTokenizer):
+    r"""
+    Construct a PEGASUS tokenizer. Based on [SentencePiece](https://github.com/google/sentencepiece).
+
+    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
+    this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
+            The token used for masking single token values. This is the token used when training this model with masked
+            language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
+            It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization](https://huggingface.co/papers/1912.08777).
+        mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
+            The token used for masking whole target sentences. This is the token used when training this model with gap
+            sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
+            pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization](https://huggingface.co/papers/1912.08777).
+        additional_special_tokens (`List[str]`, *optional*):
+            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
+            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the [original PEGASUS
+            tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
+            that uses the tokens 2 - 104 only for pretraining
+        sp_model_kwargs (`dict`, *optional*):
+            Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
+            SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
+            to set:
+
+            - `enable_sampling`: Enable subword regularization.
+            - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
+
+              - `nbest_size = {0,1}`: No sampling is performed.
+              - `nbest_size > 1`: samples from the nbest_size results.
+              - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
+                using forward-filtering-and-backward-sampling algorithm.
+
+            - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
+              BPE-dropout.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file,
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        mask_token="<mask_2>",
+        mask_token_sent="<mask_1>",
+        additional_special_tokens=None,
+        offset=103,  # entries 2 - 104 are only used for pretraining
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        **kwargs,
+    ) -> None:
+        self.offset = offset
+        if additional_special_tokens is not None:
+            if not isinstance(additional_special_tokens, list):
+                raise TypeError(
+                    f"additional_special_tokens should be of type {type(list)}, but is"
+                    f" {type(additional_special_tokens)}"
+                )
+            additional_special_tokens_extended = (
+                ([mask_token_sent] + additional_special_tokens)
+                if mask_token_sent not in additional_special_tokens and mask_token_sent is not None
+                else additional_special_tokens
+            )
+            # fill additional tokens with ..., <unk_token_102> in case not all additional tokens are already taken
+            additional_special_tokens_extended += [
+                f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
+            ]
+
+            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
+                raise ValueError(
+                    "Please make sure that the provided additional_special_tokens do not contain an incorrectly"
+                    f" shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
+                )
+            additional_special_tokens = additional_special_tokens_extended
+        else:
+            additional_special_tokens_extended = []
+            additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
+            additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
+
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        self.mask_token_sent = mask_token_sent
+        self.vocab_file = vocab_file
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(vocab_file)
+
+        _added_tokens_decoder = {
+            0: AddedToken(str(pad_token), special=True),
+            1: AddedToken(str(eos_token), special=True),
+        }
+
+        if self.mask_token_sent is not None:
+            _added_tokens_decoder[2] = AddedToken(mask_token_sent, special=True)
+            _added_tokens_decoder[3] = AddedToken(str(mask_token), special=True)
+
+        for i in range(2, self.offset):
+            _added_tokens_decoder[len(_added_tokens_decoder)] = AddedToken(f"<unk_{i}>", special=True)
+
+        # Force update as we want to make sure vocab is enforced (same as fast)
+        self._added_tokens_decoder = kwargs.pop("added_tokens_decoder", {})
+        self._added_tokens_decoder.update(_added_tokens_decoder)
+
+        super().__init__(
+            eos_token=eos_token,
+            unk_token=unk_token,
+            mask_token=mask_token,
+            pad_token=pad_token,
+            mask_token_sent=mask_token_sent,
+            offset=offset,
+            additional_special_tokens=additional_special_tokens,
+            sp_model_kwargs=self.sp_model_kwargs,
+            **kwargs,
+        )
+
+    @property
+    def vocab_size(self) -> int:
+        return len(self.sp_model) + self.offset
+
+    def get_vocab(self) -> dict[str, int]:
+        vocab = {self.convert_ids_to_tokens(i): i for i in range(self.vocab_size)}
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        return state
+
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.Load(self.vocab_file)
+
+    def _tokenize(self, text: str) -> list[str]:
+        """Take as input a string and return a list of strings (tokens) for words/sub-words"""
+        return self.sp_model.encode(text, out_type=str)
+
+    def _convert_token_to_id(self, token: str) -> int:
+        """Converts a token (str) to an id using the vocab."""
+        sp_id = self.sp_model.piece_to_id(token)
+        return sp_id + self.offset
+
+    def _convert_id_to_token(self, index: int) -> str:
+        """Converts an index (integer) to a token (str) using the vocab."""
+        if index < self.offset:
+            return self.sp_model.IdToPiece(index)
+        token = self.sp_model.IdToPiece(index - self.offset)
+        return token
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (string) in a single string."""
+        current_sub_tokens = []
+        out_string = ""
+        for token in tokens:
+            # make sure that special tokens are not decoded using sentencepiece model
+            if token in self.all_special_tokens:
+                out_string += self.sp_model.decode(current_sub_tokens) + token
+                current_sub_tokens = []
+            else:
+                current_sub_tokens.append(token)
+        out_string += self.sp_model.decode(current_sub_tokens)
+        return out_string.strip()
+
+    def num_special_tokens_to_add(self, pair=False):
+        """Just EOS"""
+        return 1
+
+    def _special_token_mask(self, seq):
+        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
+        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
+
+        return [1 if x in all_special_ids else 0 for x in seq]
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
+        if already_has_special_tokens:
+            return self._special_token_mask(token_ids_0)
+        elif token_ids_1 is None:
+            return self._special_token_mask(token_ids_0) + [1]
+        else:
+            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
+        and adding special tokens. A PEGASUS sequence has the following format, where `X` represents the sequence:
+
+        - single sequence: `X </s>`
+        - pair of sequences: `A B </s>` (not intended use)
+
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return token_ids_0 + [self.eos_token_id]
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return token_ids_0 + token_ids_1 + [self.eos_token_id]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["PegasusTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..92a37c44ff2e302fe1e2a56849f2e91fa481ec9b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pegasus/tokenization_pegasus_fast.py
@@ -0,0 +1,215 @@
+# coding=utf-8
+# Copyright 2020 Google and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization class for model PEGASUS."""
+
+import os
+from shutil import copyfile
+from typing import Optional
+
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import is_sentencepiece_available, logging
+
+
+if is_sentencepiece_available():
+    from .tokenization_pegasus import PegasusTokenizer
+else:
+    PegasusTokenizer = None
+
+
+logger = logging.get_logger(__name__)
+
+
+SPIECE_UNDERLINE = "▁"
+
+VOCAB_FILES_NAMES = {"vocab_file": "spiece.model", "tokenizer_file": "tokenizer.json"}
+
+
+class PegasusTokenizerFast(PreTrainedTokenizerFast):
+    r"""
+    Construct a "fast" PEGASUS tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [Unigram](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=unigram#models).
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    Args:
+        vocab_file (`str`):
+            [SentencePiece](https://github.com/google/sentencepiece) file (generally has a *.spm* extension) that
+            contains the vocabulary necessary to instantiate a tokenizer.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        mask_token (`str`, *optional*, defaults to `"<mask_2>"`):
+            The token used for masking single token values. This is the token used when training this model with masked
+            language modeling (MLM). This is the token that the PEGASUS encoder will try to predict during pretraining.
+            It corresponds to *[MASK2]* in [PEGASUS: Pre-training with Extracted Gap-sentences for Abstractive
+            Summarization](https://huggingface.co/papers/1912.08777).
+        mask_token_sent (`str`, *optional*, defaults to `"<mask_1>"`):
+            The token used for masking whole target sentences. This is the token used when training this model with gap
+            sentences generation (GSG). This is the sentence that the PEGASUS decoder will try to predict during
+            pretraining. It corresponds to *[MASK1]* in [PEGASUS: Pre-training with Extracted Gap-sentences for
+            Abstractive Summarization](https://huggingface.co/papers/1912.08777).
+        additional_special_tokens (`List[str]`, *optional*):
+            Additional special tokens used by the tokenizer. If no additional_special_tokens are provided <mask_2> and
+            <unk_2, ..., unk_102> are used as additional special tokens corresponding to the [original PEGASUS
+            tokenizer](https://github.com/google-research/pegasus/blob/939830367bcf411193d2b5eca2f2f90f3f9260ca/pegasus/ops/pretrain_parsing_ops.cc#L66)
+            that uses the tokens 2 - 104 only for pretraining
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = PegasusTokenizer
+    model_input_names = ["input_ids", "attention_mask"]
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        pad_token="<pad>",
+        eos_token="</s>",
+        unk_token="<unk>",
+        mask_token="<mask_2>",
+        mask_token_sent="<mask_1>",
+        additional_special_tokens=None,
+        offset=103,  # entries 2 - 104 are only used for pretraining
+        **kwargs,
+    ):
+        self.offset = offset
+
+        if additional_special_tokens is not None:
+            if not isinstance(additional_special_tokens, list):
+                raise TypeError(
+                    f"additional_special_tokens should be of type {type(list)}, but is"
+                    f" {type(additional_special_tokens)}"
+                )
+
+            additional_special_tokens_extended = (
+                ([mask_token_sent] + additional_special_tokens)
+                if mask_token_sent not in additional_special_tokens and mask_token_sent is not None
+                else additional_special_tokens
+            )
+            # fill additional tokens with ..., <unk_token_102> in case not all additional tokens are already taken
+            additional_special_tokens_extended += [
+                f"<unk_{i}>" for i in range(len(additional_special_tokens_extended), self.offset - 1)
+            ]
+
+            if len(set(additional_special_tokens_extended)) != len(additional_special_tokens_extended):
+                raise ValueError(
+                    "Please make sure that the provided additional_special_tokens do not contain an incorrectly"
+                    f" shifted list of <unk_x> tokens. Found {additional_special_tokens_extended}."
+                )
+            additional_special_tokens = additional_special_tokens_extended
+        else:
+            additional_special_tokens = [mask_token_sent] if mask_token_sent is not None else []
+            additional_special_tokens += [f"<unk_{i}>" for i in range(2, self.offset)]
+
+        # pegasus was design to support changing the index of the first tokens. If one of the padding/eos/unk/mask token
+        # is different from default, we must rebuild the vocab
+        from_slow = kwargs.pop("from_slow", None)
+        from_slow = from_slow or str(pad_token) != "<pad>" or str(eos_token) != "</s>" or str(unk_token) != "<unk>"
+
+        kwargs.pop("added_tokens_decoder", {})
+
+        super().__init__(
+            vocab_file,
+            tokenizer_file=tokenizer_file,
+            pad_token=pad_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            mask_token=mask_token,
+            mask_token_sent=mask_token_sent,
+            offset=offset,
+            additional_special_tokens=additional_special_tokens,
+            from_slow=from_slow,
+            **kwargs,
+        )
+        self.vocab_file = vocab_file
+
+    def _special_token_mask(self, seq):
+        all_special_ids = set(self.all_special_ids)  # call it once instead of inside list comp
+        all_special_ids.remove(self.unk_token_id)  # <unk> is only sometimes special
+
+        if all_special_ids != set(range(len(self.additional_special_tokens) + 3)):
+            raise ValueError(
+                "There should be 3 special tokens: mask_token, pad_token, and eos_token +"
+                f" {len(self.additional_special_tokens)} additional_special_tokens, but got {all_special_ids}"
+            )
+
+        return [1 if x in all_special_ids else 0 for x in seq]
+
+    def get_special_tokens_mask(
+        self, token_ids_0: list, token_ids_1: Optional[list] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """Get list where entries are [1] if a token is [eos] or [pad] else 0."""
+        if already_has_special_tokens:
+            return self._special_token_mask(token_ids_0)
+        elif token_ids_1 is None:
+            return self._special_token_mask(token_ids_0) + [1]
+        else:
+            return self._special_token_mask(token_ids_0 + token_ids_1) + [1]
+
+    def build_inputs_with_special_tokens(self, token_ids_0, token_ids_1=None) -> list[int]:
+        """
+        Build model inputs from a sequence by adding eos to the end. no bos token is added to the front.
+
+        - single sequence: `X </s>`
+        - pair of sequences: `A B </s>` (not intended use)
+
+        Args:
+            token_ids_0 (`List[int]`):
+                List of IDs to which the special tokens will be added
+            token_ids_1 (`List[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `List[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return token_ids_0 + [self.eos_token_id]
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return token_ids_0 + token_ids_1 + [self.eos_token_id]
+
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+
+__all__ = ["PegasusTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..81c3ba93bcf470394a99309dbf2f9869cd68d887
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_perception_lm import *
+    from .image_processing_perception_lm_fast import *
+    from .modeling_perception_lm import *
+    from .processing_perception_lm import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8e80bee88e51a38c7cb3fedd1ae176619f2f6a20
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/configuration_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/configuration_perception_lm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bc223af4874c3dbc78c19a72540faf51a61bf8a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/configuration_perception_lm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/image_processing_perception_lm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/image_processing_perception_lm_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e421aa482e683b7fc6459504c5b4f3a478a576b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/image_processing_perception_lm_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modeling_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modeling_perception_lm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99e8d01b648d207e6486c400ecdce3c65a99e05a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modeling_perception_lm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modular_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modular_perception_lm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..135fcea17e68a13d3cda7f6c6f56bfb510e52bbe
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/modular_perception_lm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/processing_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/processing_perception_lm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..703e4ea0ed6dc5522d0c651bd4222582e67449dd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/processing_perception_lm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/video_processing_perception_lm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/video_processing_perception_lm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5262a0bc7a3ea9f03b22d373ef92e7730502f319
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/__pycache__/video_processing_perception_lm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/configuration_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/configuration_perception_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..08c084065ff86a519007afc9d6a1d97eacf6d381
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/configuration_perception_lm.py
@@ -0,0 +1,88 @@
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PerceptionLM model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ..auto import CONFIG_MAPPING, AutoConfig
+from ..timm_wrapper.configuration_timm_wrapper import TimmWrapperConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class PerceptionLMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`PerceptionLMForConditionalGeneration`]. It is used to instantiate an
+    PerceptionLM model according to the specified arguments, defining the model architecture.
+
+    Example models:
+    -  [facebook/Perception-LM-1B](https://huggingface.co/facebook/Perception-LM-1B).
+    -  [facebook/Perception-LM-3B](https://huggingface.co/facebook/Perception-LM-3B).
+    -  [facebook/Perception-LM-8B](https://huggingface.co/facebook/Perception-LM-8B).
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        vision_config (`Union[TimmWrapperConfig, dict]`, *optional*, defaults to `TimmWrapperConfig()`):
+            The config object or dictionary of the vision backbone.
+        text_config (`Union[PretrainedConfig, dict]`, *optional*, defaults to `LlamaConfig()`):
+            The config object or dictionary of the text backbone.
+        vision_use_cls_token (`bool`, *optional*, defaults to `True`):
+            Whether CLS token is used in the vision backbone. If used, we remove CLS token embedding from vision output.
+        projector_pooling_ratio (`int`, *optional*, defaults to 1):
+            The pooling ratio used in the multimodal projector.
+        image_token_id (`int`, *optional*, defaults to 128002):
+            The image token index to encode the image prompt.
+        video_token_id (`int`, *optional*, defaults to 128003):
+            The video token index to encode the video prompt.
+    """
+
+    model_type = "perception_lm"
+    sub_configs = {"text_config": AutoConfig, "vision_config": TimmWrapperConfig}
+
+    def __init__(
+        self,
+        vision_config=None,
+        text_config=None,
+        vision_use_cls_token=True,
+        projector_pooling_ratio=1,
+        image_token_id=128002,
+        video_token_id=128003,
+        **kwargs,
+    ):
+        self.image_token_id = image_token_id
+        self.video_token_id = video_token_id
+        if isinstance(vision_config, dict):
+            vision_config = TimmWrapperConfig(**vision_config)
+        elif isinstance(vision_config, TimmWrapperConfig):
+            pass
+        elif vision_config is None:
+            vision_config = TimmWrapperConfig()
+        self.vision_config = vision_config
+        self.vision_use_cls_token = vision_use_cls_token
+
+        if isinstance(text_config, dict):
+            text_config["model_type"] = text_config.get("model_type", "llama")
+            text_config = CONFIG_MAPPING[text_config["model_type"]](**text_config)
+        elif text_config is None:
+            text_config = CONFIG_MAPPING["llama"]()
+
+        self.text_config = text_config
+        self.projector_pooling_ratio = projector_pooling_ratio
+        super().__init__(**kwargs)
+
+
+__all__ = ["PerceptionLMConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/image_processing_perception_lm_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/image_processing_perception_lm_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..c26132a484397067ad7354ea50705e493f063ff1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/image_processing_perception_lm_fast.py
@@ -0,0 +1,309 @@
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Image processor class for PerceptionLM."""
+
+import math
+from functools import reduce
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torchvision.transforms import functional as F
+
+from ...image_processing_utils import (
+    BatchFeature,
+)
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    get_image_size,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    IMAGENET_STANDARD_MEAN,
+    IMAGENET_STANDARD_STD,
+    ChannelDimension,
+    PILImageResampling,
+)
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring
+
+
+class PerceptionLMFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    vision_input_type (`str`, *optional*, defaults to `"thumb+tile"`):
+        Vision processing strategy. `"thumb+tile"` uses both thumbnails and multiple tiles for
+        multi-scale processing, otherwise uses single tile for lower memory usage.
+    tile_size (`int`, *optional*, defaults to `448`):
+        Height and width dimension (in pixels) of each tile used for image processing.
+    max_num_tiles (`int`, *optional*, defaults to `36`):
+        Maximum number of tiles an image can be split into based on its aspect ratio.
+    """
+
+    vision_input_type: str = "thumb+tile"
+    tile_size: int = 448
+    max_num_tiles: int = 36
+
+
+@auto_docstring
+class PerceptionLMImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BICUBIC
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    do_resize = True
+    do_center_crop = False
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    size = {"width": 448, "height": 448}  # for backward compatibility in tests
+    valid_kwargs = PerceptionLMFastImageProcessorKwargs
+
+    def __init__(self, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> None:
+        super().__init__(**kwargs)
+
+    @auto_docstring
+    def preprocess(self, images, **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs]) -> BatchFeature:
+        return super().preprocess(images, **kwargs)
+
+    @staticmethod
+    def _factors(n: int):
+        """Return all factors of a number."""
+        return set(
+            reduce(
+                list.__add__,
+                ([i, n // i] for i in range(1, int(n**0.5) + 1) if n % i == 0),
+            )
+        )
+
+    def _find_supported_aspect_ratios(self):
+        """
+        This function computes all the allowed aspect ratios for a fixed
+        number of input chunks. The order of returned items matters for the result of `_fit_image_to_canvas` function.
+        If tie exists in `_fit_image_to_canvas`, the latter in `_find_supported_aspect_ratios` wins.
+
+        For example, with `num_tiles=5`, it will return:
+        {
+            0.2: [(1, 5)],
+            5.0: [(5, 1)],
+            0.25: [(1, 4)],
+            1.0: [(2, 2), (1, 1)],
+            4.0: [(4, 1)],
+            0.3333333333333333: [(1, 3)],
+            3.0: [(3, 1)],
+            0.5: [(1, 2)],
+            2.0: [(2, 1)]
+        }
+        """
+        asp_dict = {}
+        for chunk_size in range(self.max_num_tiles, 0, -1):
+            _factors = sorted(self._factors(chunk_size))
+            _asp_ratios = [(x, chunk_size // x) for x in _factors]
+            for ratio in _asp_ratios:
+                k = ratio[0] / ratio[1]
+                if k not in asp_dict:
+                    asp_dict[k] = [ratio]
+                else:
+                    asp_dict[k].append(ratio)
+        return asp_dict
+
+    def _get_image_height_width(
+        self, image_width: int, image_height: int, target_width: int, target_height: int
+    ) -> tuple[int, int]:
+        """
+        Given image width, height and target width, height for the canvas, return the dimensions of how the image would be resized
+        with aspect ratio preservation.
+        """
+        scale = image_width / image_height
+
+        if scale > 1.0:
+            # Width is larger than height
+
+            # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas.
+            rescaling_factor = min(target_width / image_width, target_height / image_height)
+
+            # Set new width to target width and height to the rescaled height.
+            new_w = rescaling_factor * image_width
+            new_h = math.floor(new_w / scale)
+
+        else:
+            # Height is larger than width
+
+            # Rescaling factor is the minimum of the two scaling factors. Else one side would be outside of the canvas.
+            rescaling_factor = min(target_width / image_width, target_height / image_height)
+
+            # Set new height to target height and width to the rescaled width.
+            new_h = rescaling_factor * image_height
+            new_w = math.floor(new_h * scale)
+
+        return new_w, new_h
+
+    def _fit_image_to_canvas(self, img_width: int, img_height: int, tile_size: int):
+        """
+        Given an image width, height and target number of chunks this function will see if the image
+        can be fit into any of the canvases that can be build from arranging the tiles in a grid.
+        If the image can be fit onto several canvases, it will return the canvas where the shorter edge
+        of the image will be largest.
+        """
+        # Initialize the optimal canvas to None. If no canvas is found where image fits, function returns None.
+        optimal_canvas = None
+        optimal_image_width_height = None
+
+        scale = img_width / img_height
+
+        # Gather all potential supported image resolutions and iterate through them to find best match
+        potential_arrangements = [
+            item for sublist in self._find_supported_aspect_ratios().values() for item in sublist
+        ]
+        for n_w, n_h in potential_arrangements:
+            # Compute the canvas size
+            canvas_width, canvas_height = n_w * tile_size, n_h * tile_size
+
+            # Check if image can fit into the canvas without downsampling
+            if canvas_width >= img_width and canvas_height >= img_height:
+                # If we did not find a good canvas yet, we will use the current one
+                if optimal_canvas is None:
+                    # Set optimal canvas and determine the actual image height and width in the canvas with aspect ratio preserving resampling
+                    optimal_canvas = (n_w, n_h)
+                    optimal_image_width_height = self._get_image_height_width(
+                        image_width=img_width,
+                        image_height=img_height,
+                        target_width=n_w * tile_size,
+                        target_height=n_h * tile_size,
+                    )
+                else:
+                    # If we already found an optimal canvas before, we will check if the shorter edge of the image will be larger than the current optimal canvas.
+                    # This means we can potentially upsample the image resolution which is beneficial to performance.
+                    image_width_height = self._get_image_height_width(
+                        image_width=img_width,
+                        image_height=img_height,
+                        target_width=n_w * tile_size,
+                        target_height=n_h * tile_size,
+                    )
+                    # Llama3V dynamic tiling. Prioritize biggest canvas.
+                    if (scale < 1.0 and (image_width_height[0] >= optimal_image_width_height[0])) or (
+                        scale >= 1.0 and (image_width_height[1] >= optimal_image_width_height[1])
+                    ):
+                        optimal_canvas = (n_w, n_h)
+                        optimal_image_width_height = image_width_height
+        return optimal_canvas
+
+    def _find_closest_aspect_ratio(self, img_width: int, img_height: int, tile_size: int) -> tuple:
+        """
+        Given an image width, height and target number of chunks
+        this function will find the closest supported aspect ratio.
+        """
+        target_aspect_ratio = img_width / img_height
+        asp_dict = self._find_supported_aspect_ratios()
+        closest_aspect_ratio = None
+        if target_aspect_ratio >= 1:
+            closest_aspect_ratio = min(
+                [k for k in asp_dict if k <= target_aspect_ratio],
+                key=lambda x: abs(x - target_aspect_ratio),
+            )
+            tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio]
+            # select largest width
+            return max(tiles_given_aspect_ratio, key=lambda x: x[0])
+        else:
+            closest_aspect_ratio = min(
+                [k for k in asp_dict if k > target_aspect_ratio],
+                key=lambda x: abs(1 / x - 1 / target_aspect_ratio),
+            )
+            tiles_given_aspect_ratio = asp_dict[closest_aspect_ratio]
+            # select largest height
+            return max(tiles_given_aspect_ratio, key=lambda x: x[1])
+
+    def _split(self, image: torch.Tensor, ncw: int, nch: int) -> torch.Tensor:
+        # Split image into number of required tiles (width x height)
+        batch_size, num_channels, height, width = image.size()
+        image = image.view(batch_size, num_channels, nch, height // nch, ncw, width // ncw)
+        # Permute dimensions to reorder the axes
+        image = image.permute(0, 2, 4, 1, 3, 5).contiguous()
+        # Reshape into the desired output shape (batch_size * 4, num_channels, width/2, height/2)
+        image = image.view(batch_size, ncw * nch, num_channels, height // nch, width // ncw)
+        return image
+
+    def resize(
+        self,
+        image: np.ndarray,
+        tile_size: int,
+        max_num_tiles: int,
+        resample: PILImageResampling = PILImageResampling.BICUBIC,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ):
+        height, width = get_image_size(image, channel_dim=input_data_format)
+        if max_num_tiles > 1:
+            aspect_ratio = self._fit_image_to_canvas(img_width=width, img_height=height, tile_size=tile_size)
+            if aspect_ratio is None:
+                # If we did not find a canvas, we have to find the closest aspect ratio and downsample the image
+                aspect_ratio = self._find_closest_aspect_ratio(img_width=width, img_height=height, tile_size=tile_size)
+        else:
+            aspect_ratio = (1, 1)
+        new_width, new_height = aspect_ratio[0] * tile_size, aspect_ratio[1] * tile_size
+        image = F.resize(image, (new_height, new_width), interpolation=resample)
+        return image, aspect_ratio
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_resize: bool,
+        do_rescale: Optional[bool],
+        rescale_factor: Optional[Union[int, float]],
+        do_normalize: Optional[bool],
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        vision_input_type: str,
+        tile_size: int,
+        max_num_tiles: int,
+        return_tensors: Optional[Union[str, TensorType]],
+        disable_grouping: bool,
+        **kwargs: Unpack[PerceptionLMFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        # Group images by size for batched transformation
+        grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+        resized_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            if do_resize:
+                if vision_input_type == "thumb+tile":
+                    thumbnails, _ = self.resize(stacked_images, tile_size, max_num_tiles=1)
+                    images_for_tiling, (tiles_w, tiles_h) = self.resize(
+                        stacked_images, tile_size, max_num_tiles=max_num_tiles
+                    )
+                    image_tiles = self._split(images_for_tiling, tiles_w, tiles_h)
+                    stacked_images = torch.cat([thumbnails.unsqueeze(1), image_tiles], dim=1)
+                else:  # vanilla single tile for low memory devices
+                    stacked_images, _ = self.resize(stacked_images, tile_size, max_num_tiles=1)
+
+            resized_images_grouped[shape] = stacked_images
+        resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images,
+                do_rescale,
+                rescale_factor,
+                do_normalize,
+                image_mean,
+                image_std,
+            )
+            processed_images_grouped[shape] = stacked_images
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+        processed_images = [p[None] if p.ndim == 3 else p for p in processed_images]  # add tiles dimension if needed
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+
+__all__ = ["PerceptionLMImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modeling_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modeling_perception_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..074e91e14e88d29296e40f8a3f5c94663221997d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modeling_perception_lm.py
@@ -0,0 +1,487 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/perception_lm/modular_perception_lm.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_perception_lm.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_outputs import BaseModelOutputWithPast, ModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, can_return_tuple
+from ..auto import AutoModel
+from .configuration_perception_lm import PerceptionLMConfig
+
+
+class PerceptionLMAdaptiveAvgPooling(nn.Module):
+    def __init__(self, pooling_ratio=2):
+        super().__init__()
+        self.pooling_ratio = pooling_ratio
+
+    def forward(self, hidden_states):
+        b, num_tokens, c = hidden_states.shape
+        h = int(math.sqrt(num_tokens))
+        if h * h != num_tokens:
+            raise ValueError(f"num_tokens {num_tokens} is expected to be a square number")
+
+        shape = (h // self.pooling_ratio, h // self.pooling_ratio)
+        hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h)
+        hidden_states = F.adaptive_avg_pool2d(hidden_states, shape)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        return hidden_states
+
+
+class PerceptionLMMultiModalProjector(nn.Module):
+    def __init__(self, config: PerceptionLMConfig):
+        super().__init__()
+        input_size = config.vision_config.model_args["embed_dim"]
+        output_size = config.text_config.hidden_size
+        self.linear_1 = nn.Linear(
+            in_features=input_size,
+            out_features=output_size,
+            bias=True,
+        )
+        self.gelu = nn.GELU()
+        self.linear_2 = nn.Linear(
+            in_features=output_size,
+            out_features=output_size,
+            bias=True,
+        )
+        self.pooling = (
+            PerceptionLMAdaptiveAvgPooling(config.projector_pooling_ratio)
+            if config.projector_pooling_ratio > 1
+            else nn.Identity()
+        )
+
+    def forward(self, features):
+        features = features.permute(1, 0, 2)  # NLD -> LND
+        features = self.linear_1(features)
+        features = self.gelu(features)
+        features = self.linear_2(features)
+        features = features.permute(1, 0, 2)  # LND -> NLD
+        features = self.pooling(features)
+        return features
+
+
+@auto_docstring
+class PerceptionLMPreTrainedModel(PreTrainedModel):
+    config: PerceptionLMConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _skip_keys_device_placement = "past_key_values"
+
+    _supports_flash_attn = True
+    _supports_sdpa = True
+
+    _can_compile_fullgraph = True
+    _supports_flex_attn = True
+    _supports_attention_backend = True
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for PerceptionLM outputs, with hidden states and attentions.
+    """
+)
+class PerceptionLMModelOutputWithPast(BaseModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    video_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`.
+        Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+    video_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for PerceptionLM causal language model (or autoregressive) outputs.
+    """
+)
+class PerceptionLMCausalLMOutputWithPast(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    video_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`.
+        Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    logits: Optional[torch.FloatTensor] = None
+    past_key_values: Optional[Cache] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    image_hidden_states: Optional[torch.FloatTensor] = None
+
+    video_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@auto_docstring
+class PerceptionLMModel(PerceptionLMPreTrainedModel):
+    _checkpoint_conversion_mapping = {}
+
+    def __init__(self, config: PerceptionLMConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+        self.multi_modal_projector = PerceptionLMMultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.language_model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.language_model.set_input_embeddings(value)
+
+    def set_decoder(self, decoder):
+        self.language_model = decoder
+
+    def get_decoder(self):
+        return self.language_model
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_tiles, channels, height, width)`)
+               The tensors corresponding to the input images.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`).
+        """
+        image_outputs = self.vision_tower(pixel_values.flatten(0, 1))
+        image_outputs = image_outputs.last_hidden_state
+        if self.config.vision_use_cls_token:
+            image_outputs = image_outputs[:, 1:, :]
+        image_features = self.multi_modal_projector(image_outputs)
+        return image_features
+
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: Optional[torch.FloatTensor] = None,
+        video_features: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_video_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_video_mask = special_video_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+            special_video_mask = input_ids == self.config.video_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.size()[:-1].numel()}"
+            )
+
+        n_video_tokens = special_video_mask.sum()
+        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
+            raise ValueError(
+                f"Videos features and image tokens do not match: tokens: {n_video_tokens}, features {video_features.size()[:-1].numel()}"
+            )
+
+        return special_image_mask, special_video_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, PerceptionLMModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        image_features = None
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values=pixel_values)
+            image_features = image_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype)
+            special_image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        video_features = None
+        if pixel_values_videos is not None:
+            video_features = self.get_image_features(pixel_values=pixel_values_videos)
+            video_features = video_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype)
+            _, special_video_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, video_features=video_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **lm_kwargs,
+        )
+        return PerceptionLMModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            hidden_states=outputs.hidden_states,
+            past_key_values=outputs.past_key_values,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+            video_hidden_states=(video_features if pixel_values_videos is not None else None),
+        )
+
+
+@auto_docstring
+class PerceptionLMForConditionalGeneration(PerceptionLMPreTrainedModel, GenerationMixin):
+    _checkpoint_conversion_mapping = {}
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(self, config: PerceptionLMConfig):
+        super().__init__(config)
+        self.model = PerceptionLMModel(config)
+        self.lm_head = nn.Linear(config.text_config.hidden_size, config.text_config.vocab_size, bias=False)
+        self.post_init()
+
+    def get_input_embeddings(self):
+        return self.model.get_input_embeddings()
+
+    def set_input_embeddings(self, value):
+        self.model.set_input_embeddings(value)
+
+    def get_output_embeddings(self) -> nn.Module:
+        return self.lm_head
+
+    def set_decoder(self, decoder):
+        self.model.set_decoder(decoder)
+
+    def get_decoder(self):
+        return self.model.get_decoder()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        from transformers import AutoProcessor, AutoModelForImageTextToText
+        from huggingface_hub import hf_hub_download
+
+        MODEL_PATH = "facebook/Perception-LM-1B"
+        processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
+        model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda")
+        test_image_file = hf_hub_download(
+                    repo_id="shumingh/perception_lm_test_images",
+                    filename="14496_0.PNG",
+                    repo_type="dataset",
+        )
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": test_image_file,
+                    },
+                    {"type": "text", "text": "Describe the bar plot in the image."},
+                ],
+            }
+        ]
+
+        inputs = processor.apply_chat_template(
+            [conversation],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.device)
+        generate_ids = model.generate(**inputs, max_new_tokens=256)
+        input_length = inputs["input_ids"].shape[1]
+        generate_ids_without_inputs = generate_ids[:, input_length:]
+
+        for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True):
+            print(output)
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **lm_kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.text_config.vocab_size,
+                **lm_kwargs,
+            )
+
+        return PerceptionLMCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+            video_hidden_states=outputs.video_hidden_states,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        pixel_values_videos=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+            model_inputs["pixel_values_videos"] = pixel_values_videos
+        return model_inputs
+
+
+__all__ = ["PerceptionLMForConditionalGeneration", "PerceptionLMPreTrainedModel", "PerceptionLMModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modular_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modular_perception_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..2b50b824220210bebb65f5c27b206b8d8e101f84
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/modular_perception_lm.py
@@ -0,0 +1,441 @@
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch PerceptionLM model."""
+
+import math
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import nn
+
+from ...cache_utils import Cache
+from ...utils import (
+    auto_docstring,
+    can_return_tuple,
+    logging,
+)
+from ..auto import AutoModel
+from ..llava.modeling_llava import (
+    LlavaCausalLMOutputWithPast,
+    LlavaForConditionalGeneration,
+    LlavaModel,
+    LlavaModelOutputWithPast,
+    LlavaPreTrainedModel,
+)
+from .configuration_perception_lm import PerceptionLMConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class PerceptionLMAdaptiveAvgPooling(nn.Module):
+    def __init__(self, pooling_ratio=2):
+        super().__init__()
+        self.pooling_ratio = pooling_ratio
+
+    def forward(self, hidden_states):
+        b, num_tokens, c = hidden_states.shape
+        h = int(math.sqrt(num_tokens))
+        if h * h != num_tokens:
+            raise ValueError(f"num_tokens {num_tokens} is expected to be a square number")
+
+        shape = (h // self.pooling_ratio, h // self.pooling_ratio)
+        hidden_states = hidden_states.permute(0, 2, 1).reshape(b, -1, h, h)
+        hidden_states = F.adaptive_avg_pool2d(hidden_states, shape)
+        hidden_states = hidden_states.flatten(2).transpose(1, 2)
+
+        return hidden_states
+
+
+class PerceptionLMMultiModalProjector(nn.Module):
+    def __init__(self, config: PerceptionLMConfig):
+        super().__init__()
+        input_size = config.vision_config.model_args["embed_dim"]
+        output_size = config.text_config.hidden_size
+        self.linear_1 = nn.Linear(
+            in_features=input_size,
+            out_features=output_size,
+            bias=True,
+        )
+        self.gelu = nn.GELU()
+        self.linear_2 = nn.Linear(
+            in_features=output_size,
+            out_features=output_size,
+            bias=True,
+        )
+        self.pooling = (
+            PerceptionLMAdaptiveAvgPooling(config.projector_pooling_ratio)
+            if config.projector_pooling_ratio > 1
+            else nn.Identity()
+        )
+
+    def forward(self, features):
+        features = features.permute(1, 0, 2)  # NLD -> LND
+        features = self.linear_1(features)
+        features = self.gelu(features)
+        features = self.linear_2(features)
+        features = features.permute(1, 0, 2)  # LND -> NLD
+        features = self.pooling(features)
+        return features
+
+
+class PerceptionLMPreTrainedModel(LlavaPreTrainedModel):
+    base_model_prefix = "model"
+
+
+class PerceptionLMModelOutputWithPast(LlavaModelOutputWithPast):
+    r"""
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    video_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`.
+        Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    video_hidden_states: Optional[torch.FloatTensor] = None
+
+
+class PerceptionLMCausalLMOutputWithPast(LlavaCausalLMOutputWithPast):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss (for next-token prediction).
+    logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.vocab_size)`):
+        Prediction scores of the language modeling head (scores for each vocabulary token before SoftMax).
+    past_key_values (`Cache`, *optional*, returned when `use_cache=True` is passed or when `config.use_cache=True`):
+        It is a [`~cache_utils.Cache`] instance. For more details, see our [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache).
+
+        Contains pre-computed hidden-states (key and values in the self-attention blocks) that can be used (see
+        `past_key_values` input) to speed up sequential decoding.
+    image_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_images, sequence_length, hidden_size)`.
+        Image hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    video_hidden_states (`torch.FloatTensor`, *optional*):
+        A `torch.FloatTensor` of size `(batch_size, num_videos, sequence_length, hidden_size)`.
+        Video hidden_states of the model produced by the vision encoder and after projecting the last hidden state.
+    """
+
+    video_hidden_states: Optional[torch.FloatTensor] = None
+
+
+@auto_docstring
+class PerceptionLMModel(LlavaModel):
+    _checkpoint_conversion_mapping = {}
+
+    def __init__(self, config: PerceptionLMConfig):
+        super().__init__(config)
+        self.vision_tower = AutoModel.from_config(config.vision_config)
+        self.multi_modal_projector = PerceptionLMMultiModalProjector(config)
+        self.language_model = AutoModel.from_config(config.text_config)
+
+    def get_image_features(
+        self,
+        pixel_values: torch.FloatTensor,
+        **kwargs,
+    ):
+        """
+        Obtains image last hidden states from the vision tower and apply multimodal projection.
+
+        Args:
+            pixel_values (`torch.FloatTensor]` of shape `(batch_size, num_tiles, channels, height, width)`)
+               The tensors corresponding to the input images.
+        Returns:
+            image_features (`torch.Tensor`): Image feature tensor of shape `(num_tiles, num_patches, embed_dim)`).
+        """
+        image_outputs = self.vision_tower(pixel_values.flatten(0, 1))
+        image_outputs = image_outputs.last_hidden_state
+        if self.config.vision_use_cls_token:
+            image_outputs = image_outputs[:, 1:, :]
+        image_features = self.multi_modal_projector(image_outputs)
+        return image_features
+
+    def get_placeholder_mask(
+        self,
+        input_ids: torch.LongTensor,
+        inputs_embeds: torch.FloatTensor,
+        image_features: Optional[torch.FloatTensor] = None,
+        video_features: Optional[torch.FloatTensor] = None,
+    ):
+        """
+        Obtains multimodal placeholder mask from `input_ids` or `inputs_embeds`, and checks that the placeholder token count is
+        equal to the length of multimodal features. If the lengths are different, an error is raised.
+        """
+        if input_ids is None:
+            special_image_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.image_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_image_mask = special_image_mask.all(-1)
+            special_video_mask = inputs_embeds == self.get_input_embeddings()(
+                torch.tensor(self.config.video_token_id, dtype=torch.long, device=inputs_embeds.device)
+            )
+            special_video_mask = special_video_mask.all(-1)
+        else:
+            special_image_mask = input_ids == self.config.image_token_id
+            special_video_mask = input_ids == self.config.video_token_id
+
+        n_image_tokens = special_image_mask.sum()
+        special_image_mask = special_image_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if image_features is not None and inputs_embeds[special_image_mask].numel() != image_features.numel():
+            raise ValueError(
+                f"Image features and image tokens do not match: tokens: {n_image_tokens}, features {image_features.size()[:-1].numel()}"
+            )
+
+        n_video_tokens = special_video_mask.sum()
+        special_video_mask = special_video_mask.unsqueeze(-1).expand_as(inputs_embeds).to(inputs_embeds.device)
+        if video_features is not None and inputs_embeds[special_video_mask].numel() != video_features.numel():
+            raise ValueError(
+                f"Videos features and image tokens do not match: tokens: {n_video_tokens}, features {video_features.size()[:-1].numel()}"
+            )
+
+        return special_image_mask, special_video_mask
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, PerceptionLMModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+        if (pixel_values is not None or pixel_values_videos is not None) and inputs_embeds is not None:
+            raise ValueError(
+                "You cannot specify both (pixel_values or pixel_values_videos) and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if inputs_embeds is None:
+            inputs_embeds = self.get_input_embeddings()(input_ids)
+
+        image_features = None
+        if pixel_values is not None:
+            image_features = self.get_image_features(pixel_values=pixel_values)
+            image_features = image_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype)
+            special_image_mask, _ = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, image_features=image_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_image_mask, image_features)
+
+        video_features = None
+        if pixel_values_videos is not None:
+            video_features = self.get_image_features(pixel_values=pixel_values_videos)
+            video_features = video_features.to(inputs_embeds.device, dtype=inputs_embeds.dtype)
+            _, special_video_mask = self.get_placeholder_mask(
+                input_ids, inputs_embeds=inputs_embeds, video_features=video_features
+            )
+            inputs_embeds = inputs_embeds.masked_scatter(special_video_mask, video_features)
+
+        outputs = self.language_model(
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=True,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **lm_kwargs,
+        )
+        return PerceptionLMModelOutputWithPast(
+            last_hidden_state=outputs.last_hidden_state,
+            hidden_states=outputs.hidden_states,
+            past_key_values=outputs.past_key_values,
+            attentions=outputs.attentions,
+            image_hidden_states=image_features if pixel_values is not None else None,
+            video_hidden_states=(video_features if pixel_values_videos is not None else None),
+        )
+
+
+@auto_docstring
+class PerceptionLMForConditionalGeneration(LlavaForConditionalGeneration):
+    _checkpoint_conversion_mapping = {}
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        inputs_embeds=None,
+        pixel_values=None,
+        pixel_values_videos=None,
+        attention_mask=None,
+        cache_position=None,
+        logits_to_keep=None,
+        **kwargs,
+    ):
+        # Overwritten -- in specific circumstances we don't want to forward image inputs to the model
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **kwargs,
+        )
+
+        if cache_position[0] == 0:
+            # If we're in cached decoding stage, pixel values should be None because input ids do not contain special image token anymore
+            # Otherwise we need pixel values to be passed to model
+            model_inputs["pixel_values"] = pixel_values
+            model_inputs["pixel_values_videos"] = pixel_values_videos
+        return model_inputs
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        pixel_values: Optional[torch.FloatTensor] = None,
+        pixel_values_videos: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **lm_kwargs,
+    ) -> Union[tuple, PerceptionLMCausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        from transformers import AutoProcessor, AutoModelForImageTextToText
+        from huggingface_hub import hf_hub_download
+
+        MODEL_PATH = "facebook/Perception-LM-1B"
+        processor = AutoProcessor.from_pretrained(MODEL_PATH, use_fast=True)
+        model = AutoModelForImageTextToText.from_pretrained(MODEL_PATH).to("cuda")
+        test_image_file = hf_hub_download(
+                    repo_id="shumingh/perception_lm_test_images",
+                    filename="14496_0.PNG",
+                    repo_type="dataset",
+        )
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {
+                        "type": "image",
+                        "url": test_image_file,
+                    },
+                    {"type": "text", "text": "Describe the bar plot in the image."},
+                ],
+            }
+        ]
+
+        inputs = processor.apply_chat_template(
+            [conversation],
+            add_generation_prompt=True,
+            tokenize=True,
+            return_dict=True,
+            return_tensors="pt",
+        )
+        inputs = inputs.to(model.device)
+        generate_ids = model.generate(**inputs, max_new_tokens=256)
+        input_length = inputs["input_ids"].shape[1]
+        generate_ids_without_inputs = generate_ids[:, input_length:]
+
+        for output in processor.batch_decode(generate_ids_without_inputs, skip_special_tokens=True):
+            print(output)
+        ```"""
+        outputs = self.model(
+            input_ids=input_ids,
+            pixel_values=pixel_values,
+            pixel_values_videos=pixel_values_videos,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            logits_to_keep=logits_to_keep,
+            **lm_kwargs,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(
+                logits=logits,
+                labels=labels,
+                vocab_size=self.config.text_config.vocab_size,
+                **lm_kwargs,
+            )
+
+        return PerceptionLMCausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            image_hidden_states=outputs.image_hidden_states,
+            video_hidden_states=outputs.video_hidden_states,
+        )
+
+    def get_image_features(self, **kwargs):
+        raise AttributeError("Not needed for PerceptionLM")
+
+    def language_model(self):
+        raise AttributeError("Not needed for PerceptionLM")
+
+    def vision_tower(self):
+        raise AttributeError("Not needed for PerceptionLM")
+
+    def multi_modal_projector(self):
+        raise AttributeError("Not needed for PerceptionLM")
+
+
+__all__ = [
+    "PerceptionLMForConditionalGeneration",
+    "PerceptionLMPreTrainedModel",
+    "PerceptionLMModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/processing_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/processing_perception_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..f61c54554d32c22b4f22ae8905233405c828f3da
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/processing_perception_lm.py
@@ -0,0 +1,244 @@
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Processor class for PerceptionLM.
+"""
+
+from collections.abc import Iterable
+from typing import Optional, Union
+
+import numpy as np
+
+from ...feature_extraction_utils import BatchFeature
+from ...image_utils import ImageInput, get_image_size, to_numpy_array
+from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ...video_utils import VideoInput
+
+
+logger = logging.get_logger(__name__)
+
+
+class PerceptionLMProcessorKwargs(ProcessingKwargs, total=False):
+    _defaults = {
+        "text_kwargs": {
+            "padding": False,
+            "return_mm_token_type_ids": False,
+        },
+    }
+
+
+class PerceptionLMProcessor(ProcessorMixin):
+    r"""
+    Constructs a PerceptionLM processor which wraps a PerceptionLM image processor, a PerceptionLM video processor, and a tokenizer into a single processor.
+
+    [`PerceptionLMProcessor`] offers all the functionalities of [`PerceptionLMImageProcessorFast`], [`PerceptionLMVideoProcessor`], and the tokenizer (e.g. [`LlamaTokenizerFast`]). See the
+    [`~PerceptionLMProcessor.__call__`] and [`~PerceptionLMProcessor.decode`] for more information.
+
+    Args:
+        video_processor ([`PerceptionLMVideoProcessor`], *optional*):
+            The video processor to process video inputs.
+        image_processor ([`PerceptionLMImageProcessorFast`], *optional*):
+            The image processor to process image inputs.
+        tokenizer ([`LlamaTokenizerFast`] or similar, *optional*):
+            The tokenizer to process text inputs.
+        patch_size (`int`, *optional*):
+            Patch size from the vision tower.
+        chat_template (`str`, *optional*):
+            A Jinja template which will be used to convert lists of messages in a chat into a tokenizable string.
+        pooling_ratio (`int`, *optional*, defaults to 2):
+            Pooling ratio for vision tokens. If not 1, 2D adaptive pooling is applied over projected vision tokens.
+    """
+
+    attributes = ["video_processor", "image_processor", "tokenizer"]
+    image_processor_class = "AutoImageProcessor"
+    video_processor_class = "AutoVideoProcessor"
+    tokenizer_class = "AutoTokenizer"
+
+    def __init__(
+        self,
+        video_processor=None,
+        image_processor=None,
+        tokenizer=None,
+        patch_size=None,
+        chat_template=None,
+        pooling_ratio=2,
+        **kwargs,
+    ):
+        self.patch_size = patch_size
+        self.pooling_ratio = pooling_ratio
+        self.image_token = tokenizer.image_token
+        self.video_token = tokenizer.video_token
+        self.image_token_id = tokenizer.image_token_id
+        self.video_token_id = tokenizer.video_token_id
+        super().__init__(video_processor, image_processor, tokenizer, chat_template=chat_template)
+
+    def __call__(
+        self,
+        images: Optional[ImageInput] = None,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        audio=None,
+        videos: Optional[VideoInput] = None,
+        **kwargs: Unpack[PerceptionLMProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Prepares a batch containing one or more sequences of text and/or images and/or videos.
+
+        If `text` is provided, it is tokenized using the tokenizer.
+        If `images` is provided, they are processed using the image processor.
+        If `videos` is provided, they are processed using the video processor.
+
+        Args:
+            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`, *optional*):
+                The image or batch of images to be processed. Each image can be a PIL image, NumPy array, or PyTorch tensor.
+                Both channels-first and channels-last formats are supported.
+            text (`str`, `List[str]`, *optional*):
+                The sequence or batch of sequences to be tokenized. Each sequence can be a string.
+            videos (`Any`, *optional*):
+                The video or batch of videos to be processed.
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors of a particular framework. Acceptable values are:
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return NumPy `np.ndarray` objects.
+                - `'jax'`: Return JAX `jnp.ndarray` objects.
+
+        Returns:
+            [`BatchFeature`]: A [`BatchFeature`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is provided.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is provided).
+            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is provided.
+            - **pixel_values_videos** -- Video pixel values to be fed to a model. Returned when `videos` is provided.
+        """
+        if text is None:
+            raise ValueError(
+                "You have to specify at least `text` input. Optionally, you can also specify `images` or `videos`."
+            )
+
+        output_kwargs = self._merge_kwargs(
+            PerceptionLMProcessorKwargs,
+            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
+            **kwargs,
+        )
+        if images is not None:
+            image_inputs = self.image_processor(images=images, **output_kwargs["images_kwargs"])
+        else:
+            image_inputs = {}
+
+        if videos is not None:
+            videos_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
+        else:
+            videos_inputs = {}
+
+        if isinstance(text, str):
+            text = [text]
+        elif not isinstance(text, list) and not isinstance(text[0], str):
+            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
+
+        # try to expand inputs in processing if we have the necessary parts
+        prompt_strings = []
+
+        pixel_values = iter(image_inputs.get("pixel_values", []))
+        pixel_values_videos = iter(videos_inputs.get("pixel_values_videos", []))
+        for sample in text:
+            # Replace the media token with the expanded media token sequence
+            sample = self._expand_media_tokens(sample, self.tokenizer.image_token, pixel_values)
+            sample = self._expand_media_tokens(sample, self.tokenizer.video_token, pixel_values_videos)
+            prompt_strings.append(sample)
+
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
+        return_mm_token_type_ids = output_kwargs["text_kwargs"].pop("return_mm_token_type_ids", False)
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"], return_tensors=None)
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"])
+
+        if return_mm_token_type_ids:
+            array_ids = np.array(text_inputs["input_ids"])
+            mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
+            mm_token_type_ids[array_ids == self.image_token_id] = 1
+            text_inputs["mm_token_type_ids"] = mm_token_type_ids.tolist()
+
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
+
+    def _expand_media_tokens(self, sample, media_token: str, media_iter: Iterable):
+        media_count = sample.count(media_token)
+        if media_count > 0:
+            media_list = [next(media_iter) for _ in range(media_count)]
+            sample_splits = sample.split(media_token)
+            media_token_list = []
+            for media in media_list:
+                height, width = get_image_size(to_numpy_array(media))
+                num_tiles = media.shape[0]
+                num_media_tokens = (
+                    (height // self.patch_size // self.pooling_ratio)
+                    * (width // self.patch_size // self.pooling_ratio)
+                    * num_tiles
+                )
+                media_token_list.append(num_media_tokens)
+            sample = ""
+            for i, num_media_tokens in enumerate(media_token_list):
+                sample += sample_splits[i]
+                sample += media_token * num_media_tokens
+            sample += sample_splits[-1]
+        return sample
+
+    def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
+        """
+        Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
+
+        Args:
+            image_sizes (`list[list[int]]`, *optional*):
+                The input sizes formatted as (height, width) per each image.
+
+        Returns:
+            `MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
+            input modalities, along with other useful data.
+        """
+
+        vision_data = {}
+        if image_sizes is not None:
+            images_kwargs = PerceptionLMProcessorKwargs._defaults.get("images_kwargs", {})
+            images_kwargs.update(kwargs)
+            tile_size = images_kwargs.get("tile_size", None) or self.image_processor.tile_size
+            vision_input_type = images_kwargs.get("vision_input_type", None) or self.image_processor.vision_input_type
+
+            num_image_tokens = []
+            num_image_patches = []
+            for height, width in image_sizes:
+                if vision_input_type == "thumb+tile":
+                    aspect_ratio = self.image_processor._fit_image_to_canvas(
+                        img_width=width, img_height=height, tile_size=tile_size
+                    )
+                    if aspect_ratio is None:
+                        aspect_ratio = self.image_processor._find_closest_aspect_ratio(
+                            img_width=width, img_height=height, tile_size=tile_size
+                        )
+                    num_tiles = aspect_ratio[0] * aspect_ratio[1] + 1  # base image and tiles
+                else:
+                    num_tiles = 1
+
+                num_image_tokens.append(
+                    (tile_size // self.patch_size // self.pooling_ratio)
+                    * (tile_size // self.patch_size // self.pooling_ratio)
+                    * num_tiles
+                )
+                num_image_patches.append(num_tiles)
+
+            vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
+        return MultiModalData(**vision_data)
+
+
+__all__ = ["PerceptionLMProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/video_processing_perception_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/video_processing_perception_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..1023aa7c589dab5b249c6dff3d61e4d6e523eda8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/perception_lm/video_processing_perception_lm.py
@@ -0,0 +1,41 @@
+# coding=utf-8
+# Copyright 2025 Meta Platforms, Inc. and the HuggingFace Inc. team. All rights reserved.
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Video processor class for PerceptionLM."""
+
+from ...image_utils import IMAGENET_STANDARD_MEAN, IMAGENET_STANDARD_STD, PILImageResampling
+from ...processing_utils import Unpack, VideosKwargs
+from ...video_processing_utils import BaseVideoProcessor
+
+
+class PerceptionLMFastVideoProcessorInitKwargs(VideosKwargs): ...
+
+
+class PerceptionLMVideoProcessor(BaseVideoProcessor):
+    resample = PILImageResampling.BICUBIC
+    image_mean = IMAGENET_STANDARD_MEAN
+    image_std = IMAGENET_STANDARD_STD
+    size = {"height": 448, "width": 448}
+    do_resize = True
+    do_center_crop = False
+    do_rescale = True
+    do_normalize = True
+    do_convert_rgb = True
+    valid_kwargs = PerceptionLMFastVideoProcessorInitKwargs
+    model_input_names = ["pixel_values_videos"]
+
+    def __init__(self, **kwargs: Unpack[PerceptionLMFastVideoProcessorInitKwargs]):
+        super().__init__(**kwargs)
+
+
+__all__ = ["PerceptionLMVideoProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3de7bff9396f3676b4f750cc9b14a5a9d5af039d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/configuration_persimmon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/configuration_persimmon.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5dcdbfee5f9118482f6a0981abb4277dd7cfb2a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/configuration_persimmon.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/modeling_persimmon.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/modeling_persimmon.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..994952322388b60ce4e78a7bc1881827bf6ac3a2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/persimmon/__pycache__/modeling_persimmon.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..35779de1410250d586f01b35826ce2d7f8d6850d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..16869b0b233c07c42e94d8e2825d095fc4a35fd1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/configuration_phi.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9d9b62db832f30517530cd9945494e1ecb05e014
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modeling_phi.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d4280a842d28ae6544c2cb047ba9f9813e390c31
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi/__pycache__/modular_phi.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ffc31db387bfce33d61444c70bc711ba18e4e7b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/configuration_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/configuration_phi4_multimodal.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6762407a6389a521407275b1f31ac6f50c39ffc3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/configuration_phi4_multimodal.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/feature_extraction_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/feature_extraction_phi4_multimodal.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e3a0b47843c5317030f00597c5154c7d1c34dd9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/feature_extraction_phi4_multimodal.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/image_processing_phi4_multimodal_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/image_processing_phi4_multimodal_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..669ad1f83d3bd13674775a7cf665d6d28080a059
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/image_processing_phi4_multimodal_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/modular_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/modular_phi4_multimodal.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f600cd1a810eb9f06962c83be4cbda823fcc6e64
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/modular_phi4_multimodal.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/processing_phi4_multimodal.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/processing_phi4_multimodal.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6bae7e9c1b21f4aa96c983ab8cdd77372ed91f92
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phi4_multimodal/__pycache__/processing_phi4_multimodal.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..218eaf015bcef9db6be649129a065218f5506db1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/tokenization_phobert.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/tokenization_phobert.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ebcc8bcd1139c21e41b93df6e697839a995e93d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/phobert/__pycache__/tokenization_phobert.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c9603e1080f397e067683ea9c9ee42fe8d14d3f6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/configuration_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/configuration_pixtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9c863307d823e100104587bb1b0566e855d96c8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/configuration_pixtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bc1daaa741c3ee32e9bcd09caf6b1015fcacdb01
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..036b948a70a8e40329a03a506e2f84d500b634da
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/image_processing_pixtral_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/modeling_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/modeling_pixtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..17d04f1cb06895bb149564d56e9907571cfa56f3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/modeling_pixtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/processing_pixtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/processing_pixtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..419e3e153cb4454c0161f443015d4a37b1bd02f0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pixtral/__pycache__/processing_pixtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ea5914c13d396d7a49eafd15358e8a379dad079c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/configuration_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/configuration_plbart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4041b64275c71241f1a77c6c6f69af2818d7ef74
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/configuration_plbart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modeling_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modeling_plbart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b3cc5c70274ebf5e154cdec74ce28caa5dd30ce5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modeling_plbart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modular_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modular_plbart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f897c2dfcdca4d00cdce2262589d8d72d6c1add5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/modular_plbart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/tokenization_plbart.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/tokenization_plbart.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c5d45053d0d270d3747906b415ba5f40be29ddd9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/plbart/__pycache__/tokenization_plbart.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e0287444a0b0511b4f3155c47ca2a7c4444f1a3d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/configuration_pvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/configuration_pvt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..88718d88c37fd5926d3bc337a4a5bf196f287aa3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/configuration_pvt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..572a30e376d2b792c385b448cd9abf446789c26a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..230fba38a0be29110a166b3ee8e22489ded1d67b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/image_processing_pvt_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/modeling_pvt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/modeling_pvt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2968be8fd6262e0a68a28a403124c30b6e5730a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt/__pycache__/modeling_pvt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/configuration_pvt_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/configuration_pvt_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c79541e8f178c120225b56727757e30be575d9db
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/pvt_v2/__pycache__/configuration_pvt_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a46fde7375f4705708d890524357981a661ecba2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/configuration_qwen2_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/configuration_qwen2_audio.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11e25c79e30334e8e668f12373d30d4a172fc2f7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/configuration_qwen2_audio.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/modeling_qwen2_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/modeling_qwen2_audio.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a7a3f2f4dcb370238d122b0dc9040061a306933
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/modeling_qwen2_audio.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/processing_qwen2_audio.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/processing_qwen2_audio.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..30f22e32b420be13264d18225e6dbd56e89acbdc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen2_audio/__pycache__/processing_qwen2_audio.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c206b7418327caa614e4d2a792463c4e7f65ec2d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cbd02d40d3805f10bde801ff4bc96206bf4c02f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/configuration_qwen3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..287b1ffc9ce0223fac3969b16943356f7e744296
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modeling_qwen3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fac8e688842827a70932e9715c30c935fe31cdfd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3/__pycache__/modular_qwen3.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ddc1c48f33740c1b197c183d64ae8861ff77391a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/configuration_qwen3_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/configuration_qwen3_next.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7a71e423195a5a3436aec73ac163e175bc15ff3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/configuration_qwen3_next.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modeling_qwen3_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modeling_qwen3_next.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..af30c5b5b2f7cf704af18129bb3ed8b10d8ec461
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modeling_qwen3_next.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modular_qwen3_next.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modular_qwen3_next.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0cf4c1842f2f3bc36d147250a1974e2591a1cfe2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_next/__pycache__/modular_qwen3_next.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..510063a436a8f06a623d01abcbb018fae38528d0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/processing_qwen3_vl.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/processing_qwen3_vl.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..43d9c902c108f8f14de191e9b61580321bdfb291
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl/__pycache__/processing_qwen3_vl.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..edb67a2531d586fbbcb6cda77b8c861a286625e9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/configuration_qwen3_vl_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/configuration_qwen3_vl_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8607078a78fbd8ecf6667028d96329c123e1f6cc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/configuration_qwen3_vl_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modeling_qwen3_vl_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modeling_qwen3_vl_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..22fe30ce0af56667a7829b8f99089266dc8f3f7d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modeling_qwen3_vl_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modular_qwen3_vl_moe.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modular_qwen3_vl_moe.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..71c34219d52ce900fc72a29097d5dbf18d7a2b0b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/qwen3_vl_moe/__pycache__/modular_qwen3_vl_moe.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3ee1a45c035034c34db8b997d55553d5c46c8e86
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..81fcdffeee7b037200da156d7d6b851df7e0c930
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/configuration_roberta_prelayernorm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ccbed1c48d3b997e16121d15b0ced0b663b71a5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_flax_roberta_prelayernorm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4ff9c1f7624a02b888acafa67dc34ae3189bb85
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_roberta_prelayernorm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8b9751e47ec4df58d1d32a29b16b0f0afbb5fbb1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/roberta_prelayernorm/__pycache__/modeling_tf_roberta_prelayernorm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..cd1fac5b679dc5ca527dcc0a6a45b9a873daa3dc
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__init__.py
@@ -0,0 +1,33 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_rt_detr import *
+    from .configuration_rt_detr_resnet import *
+    from .image_processing_rt_detr import *
+    from .image_processing_rt_detr_fast import *
+    from .modeling_rt_detr import *
+    from .modeling_rt_detr_resnet import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..261ffab3e8a20239b7444e4dd460d8d19e857cc5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/configuration_rt_detr_resnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..504bc5537a8274c8325a3c0998907420ac5cff4d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/image_processing_rt_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f71d4c33f8dcc1f09f9b4472bc6ca8673fa27f36
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/__pycache__/modular_rt_detr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..994d4a6fd6f02a9c11817c3ad1bdcdd83a14bb24
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr.py
@@ -0,0 +1,372 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RT-DETR model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import verify_backbone_config_arguments
+from ..auto import CONFIG_MAPPING
+from .configuration_rt_detr_resnet import RTDetrResNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+class RTDetrConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RTDetrModel`]. It is used to instantiate a
+    RT-DETR model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the RT-DETR
+    [PekingU/rtdetr_r50vd](https://huggingface.co/PekingU/rtdetr_r50vd) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        initializer_range (`float`, *optional*, defaults to 0.01):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        initializer_bias_prior_prob (`float`, *optional*):
+            The prior probability used by the bias initializer to initialize biases for `enc_score_head` and `class_embed`.
+            If `None`, `prior_prob` computed as `prior_prob = 1 / (num_labels + 1)` while initializing model weights.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        batch_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the batch normalization layers.
+        backbone_config (`Dict`, *optional*, defaults to `RTDetrResNetConfig()`):
+            The configuration of the backbone model.
+        backbone (`str`, *optional*):
+            Name of backbone to use when `backbone_config` is `None`. If `use_pretrained_backbone` is `True`, this
+            will load the corresponding pretrained weights from the timm or transformers library. If `use_pretrained_backbone`
+            is `False`, this loads the backbone's config and uses that to initialize the backbone with random weights.
+        use_pretrained_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to use pretrained weights for the backbone.
+        use_timm_backbone (`bool`, *optional*, defaults to `False`):
+            Whether to load `backbone` from the timm library. If `False`, the backbone is loaded from the transformers
+            library.
+        freeze_backbone_batch_norms (`bool`, *optional*, defaults to `True`):
+            Whether to freeze the batch normalization layers in the backbone.
+        backbone_kwargs (`dict`, *optional*):
+            Keyword arguments to be passed to AutoBackbone when loading from a checkpoint
+            e.g. `{'out_indices': (0, 1, 2, 3)}`. Cannot be specified if `backbone_config` is set.
+        encoder_hidden_dim (`int`, *optional*, defaults to 256):
+            Dimension of the layers in hybrid encoder.
+        encoder_in_channels (`list`, *optional*, defaults to `[512, 1024, 2048]`):
+            Multi level features input for encoder.
+        feat_strides (`list[int]`, *optional*, defaults to `[8, 16, 32]`):
+            Strides used in each feature map.
+        encoder_layers (`int`, *optional*, defaults to 1):
+            Total of layers to be used by the encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        dropout (`float`, *optional*, defaults to 0.0):
+            The ratio for all dropout layers.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for activations inside the fully connected layer.
+        encode_proj_layers (`list[int]`, *optional*, defaults to `[2]`):
+            Indexes of the projected layers to be used in the encoder.
+        positional_encoding_temperature (`int`, *optional*, defaults to 10000):
+            The temperature parameter used to create the positional encodings.
+        encoder_activation_function (`str`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        activation_function (`str`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the general layer. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        eval_size (`tuple[int, int]`, *optional*):
+            Height and width used to computes the effective height and width of the position embeddings after taking
+            into account the stride.
+        normalize_before (`bool`, *optional*, defaults to `False`):
+            Determine whether to apply layer normalization in the transformer encoder layer before self-attention and
+            feed-forward modules.
+        hidden_expansion (`float`, *optional*, defaults to 1.0):
+            Expansion ratio to enlarge the dimension size of RepVGGBlock and CSPRepLayer.
+        d_model (`int`, *optional*, defaults to 256):
+            Dimension of the layers exclude hybrid encoder.
+        num_queries (`int`, *optional*, defaults to 300):
+            Number of object queries.
+        decoder_in_channels (`list`, *optional*, defaults to `[256, 256, 256]`):
+            Multi level features dimension for decoder
+        decoder_ffn_dim (`int`, *optional*, defaults to 1024):
+            Dimension of the "intermediate" (often named feed-forward) layer in decoder.
+        num_feature_levels (`int`, *optional*, defaults to 3):
+            The number of input feature levels.
+        decoder_n_points (`int`, *optional*, defaults to 4):
+            The number of sampled keys in each feature level for each attention head in the decoder.
+        decoder_layers (`int`, *optional*, defaults to 6):
+            Number of decoder layers.
+        decoder_attention_heads (`int`, *optional*, defaults to 8):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        decoder_activation_function (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the decoder. If string, `"gelu"`,
+            `"relu"`, `"silu"` and `"gelu_new"` are supported.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        num_denoising (`int`, *optional*, defaults to 100):
+            The total number of denoising tasks or queries to be used for contrastive denoising.
+        label_noise_ratio (`float`, *optional*, defaults to 0.5):
+            The fraction of denoising labels to which random noise should be added.
+        box_noise_scale (`float`, *optional*, defaults to 1.0):
+            Scale or magnitude of noise to be added to the bounding boxes.
+        learn_initial_query (`bool`, *optional*, defaults to `False`):
+            Indicates whether the initial query embeddings for the decoder should be learned during training
+        anchor_image_size (`tuple[int, int]`, *optional*):
+            Height and width of the input image used during evaluation to generate the bounding box anchors. If None, automatic generate anchor is applied.
+        disable_custom_kernels (`bool`, *optional*, defaults to `True`):
+            Whether to disable custom kernels.
+        with_box_refine (`bool`, *optional*, defaults to `True`):
+            Whether to apply iterative bounding box refinement, where each decoder layer refines the bounding boxes
+            based on the predictions from the previous layer.
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the architecture has an encoder decoder structure.
+        matcher_alpha (`float`, *optional*, defaults to 0.25):
+            Parameter alpha used by the Hungarian Matcher.
+        matcher_gamma (`float`, *optional*, defaults to 2.0):
+            Parameter gamma used by the Hungarian Matcher.
+        matcher_class_cost (`float`, *optional*, defaults to 2.0):
+            The relative weight of the class loss used by the Hungarian Matcher.
+        matcher_bbox_cost (`float`, *optional*, defaults to 5.0):
+            The relative weight of the bounding box loss used by the Hungarian Matcher.
+        matcher_giou_cost (`float`, *optional*, defaults to 2.0):
+            The relative weight of the giou loss of used by the Hungarian Matcher.
+        use_focal_loss (`bool`, *optional*, defaults to `True`):
+            Parameter informing if focal focal should be used.
+        auxiliary_loss (`bool`, *optional*, defaults to `True`):
+            Whether auxiliary decoding losses (loss at each decoder layer) are to be used.
+        focal_loss_alpha (`float`, *optional*, defaults to 0.75):
+            Parameter alpha used to compute the focal loss.
+        focal_loss_gamma (`float`, *optional*, defaults to 2.0):
+            Parameter gamma used to compute the focal loss.
+        weight_loss_vfl (`float`, *optional*, defaults to 1.0):
+            Relative weight of the varifocal loss in the object detection loss.
+        weight_loss_bbox (`float`, *optional*, defaults to 5.0):
+            Relative weight of the L1 bounding box loss in the object detection loss.
+        weight_loss_giou (`float`, *optional*, defaults to 2.0):
+            Relative weight of the generalized IoU loss in the object detection loss.
+        eos_coefficient (`float`, *optional*, defaults to 0.0001):
+            Relative classification weight of the 'no-object' class in the object detection loss.
+
+    Examples:
+
+    ```python
+    >>> from transformers import RTDetrConfig, RTDetrModel
+
+    >>> # Initializing a RT-DETR configuration
+    >>> configuration = RTDetrConfig()
+
+    >>> # Initializing a model (with random weights) from the configuration
+    >>> model = RTDetrModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "rt_detr"
+    layer_types = ["basic", "bottleneck"]
+    attribute_map = {
+        "hidden_size": "d_model",
+        "num_attention_heads": "encoder_attention_heads",
+    }
+
+    def __init__(
+        self,
+        initializer_range=0.01,
+        initializer_bias_prior_prob=None,
+        layer_norm_eps=1e-5,
+        batch_norm_eps=1e-5,
+        # backbone
+        backbone_config=None,
+        backbone=None,
+        use_pretrained_backbone=False,
+        use_timm_backbone=False,
+        freeze_backbone_batch_norms=True,
+        backbone_kwargs=None,
+        # encoder HybridEncoder
+        encoder_hidden_dim=256,
+        encoder_in_channels=[512, 1024, 2048],
+        feat_strides=[8, 16, 32],
+        encoder_layers=1,
+        encoder_ffn_dim=1024,
+        encoder_attention_heads=8,
+        dropout=0.0,
+        activation_dropout=0.0,
+        encode_proj_layers=[2],
+        positional_encoding_temperature=10000,
+        encoder_activation_function="gelu",
+        activation_function="silu",
+        eval_size=None,
+        normalize_before=False,
+        hidden_expansion=1.0,
+        # decoder RTDetrTransformer
+        d_model=256,
+        num_queries=300,
+        decoder_in_channels=[256, 256, 256],
+        decoder_ffn_dim=1024,
+        num_feature_levels=3,
+        decoder_n_points=4,
+        decoder_layers=6,
+        decoder_attention_heads=8,
+        decoder_activation_function="relu",
+        attention_dropout=0.0,
+        num_denoising=100,
+        label_noise_ratio=0.5,
+        box_noise_scale=1.0,
+        learn_initial_query=False,
+        anchor_image_size=None,
+        disable_custom_kernels=True,
+        with_box_refine=True,
+        is_encoder_decoder=True,
+        # Loss
+        matcher_alpha=0.25,
+        matcher_gamma=2.0,
+        matcher_class_cost=2.0,
+        matcher_bbox_cost=5.0,
+        matcher_giou_cost=2.0,
+        use_focal_loss=True,
+        auxiliary_loss=True,
+        focal_loss_alpha=0.75,
+        focal_loss_gamma=2.0,
+        weight_loss_vfl=1.0,
+        weight_loss_bbox=5.0,
+        weight_loss_giou=2.0,
+        eos_coefficient=1e-4,
+        **kwargs,
+    ):
+        self.initializer_range = initializer_range
+        self.initializer_bias_prior_prob = initializer_bias_prior_prob
+        self.layer_norm_eps = layer_norm_eps
+        self.batch_norm_eps = batch_norm_eps
+        # backbone
+        if backbone_config is None and backbone is None:
+            logger.info(
+                "`backbone_config` and `backbone` are `None`. Initializing the config with the default `RTDetr-ResNet` backbone."
+            )
+            backbone_config = RTDetrResNetConfig(
+                num_channels=3,
+                embedding_size=64,
+                hidden_sizes=[256, 512, 1024, 2048],
+                depths=[3, 4, 6, 3],
+                layer_type="bottleneck",
+                hidden_act="relu",
+                downsample_in_first_stage=False,
+                downsample_in_bottleneck=False,
+                out_features=None,
+                out_indices=[2, 3, 4],
+            )
+        elif isinstance(backbone_config, dict):
+            backbone_model_type = backbone_config.pop("model_type")
+            config_class = CONFIG_MAPPING[backbone_model_type]
+            backbone_config = config_class.from_dict(backbone_config)
+
+        verify_backbone_config_arguments(
+            use_timm_backbone=use_timm_backbone,
+            use_pretrained_backbone=use_pretrained_backbone,
+            backbone=backbone,
+            backbone_config=backbone_config,
+            backbone_kwargs=backbone_kwargs,
+        )
+
+        self.backbone_config = backbone_config
+        self.backbone = backbone
+        self.use_pretrained_backbone = use_pretrained_backbone
+        self.use_timm_backbone = use_timm_backbone
+        self.freeze_backbone_batch_norms = freeze_backbone_batch_norms
+        self.backbone_kwargs = backbone_kwargs
+        # encoder
+        self.encoder_hidden_dim = encoder_hidden_dim
+        self.encoder_in_channels = encoder_in_channels
+        self.feat_strides = feat_strides
+        self.encoder_attention_heads = encoder_attention_heads
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.dropout = dropout
+        self.activation_dropout = activation_dropout
+        self.encode_proj_layers = encode_proj_layers
+        self.encoder_layers = encoder_layers
+        self.positional_encoding_temperature = positional_encoding_temperature
+        self.eval_size = eval_size
+        self.normalize_before = normalize_before
+        self.encoder_activation_function = encoder_activation_function
+        self.activation_function = activation_function
+        self.hidden_expansion = hidden_expansion
+        # decoder
+        self.d_model = d_model
+        self.num_queries = num_queries
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_in_channels = decoder_in_channels
+        self.num_feature_levels = num_feature_levels
+        self.decoder_n_points = decoder_n_points
+        self.decoder_layers = decoder_layers
+        self.decoder_attention_heads = decoder_attention_heads
+        self.decoder_activation_function = decoder_activation_function
+        self.attention_dropout = attention_dropout
+        self.num_denoising = num_denoising
+        self.label_noise_ratio = label_noise_ratio
+        self.box_noise_scale = box_noise_scale
+        self.learn_initial_query = learn_initial_query
+        self.anchor_image_size = anchor_image_size
+        self.auxiliary_loss = auxiliary_loss
+        self.disable_custom_kernels = disable_custom_kernels
+        self.with_box_refine = with_box_refine
+        # Loss
+        self.matcher_alpha = matcher_alpha
+        self.matcher_gamma = matcher_gamma
+        self.matcher_class_cost = matcher_class_cost
+        self.matcher_bbox_cost = matcher_bbox_cost
+        self.matcher_giou_cost = matcher_giou_cost
+        self.use_focal_loss = use_focal_loss
+        self.focal_loss_alpha = focal_loss_alpha
+        self.focal_loss_gamma = focal_loss_gamma
+        self.weight_loss_vfl = weight_loss_vfl
+        self.weight_loss_bbox = weight_loss_bbox
+        self.weight_loss_giou = weight_loss_giou
+        self.eos_coefficient = eos_coefficient
+        super().__init__(is_encoder_decoder=is_encoder_decoder, **kwargs)
+
+    @property
+    def num_attention_heads(self) -> int:
+        return self.encoder_attention_heads
+
+    @property
+    def hidden_size(self) -> int:
+        return self.d_model
+
+    @property
+    def sub_configs(self):
+        return (
+            {"backbone_config": type(self.backbone_config)}
+            if getattr(self, "backbone_config", None) is not None
+            else {}
+        )
+
+    @classmethod
+    def from_backbone_configs(cls, backbone_config: PretrainedConfig, **kwargs):
+        """Instantiate a [`RTDetrConfig`] (or a derived class) from a pre-trained backbone model configuration and DETR model
+        configuration.
+
+            Args:
+                backbone_config ([`PretrainedConfig`]):
+                    The backbone configuration.
+
+            Returns:
+                [`RTDetrConfig`]: An instance of a configuration object
+        """
+        return cls(
+            backbone_config=backbone_config,
+            **kwargs,
+        )
+
+
+__all__ = ["RTDetrConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..73b9517ab1498fe7fb91fd8f67a0827a55cacc7d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/configuration_rt_detr_resnet.py
@@ -0,0 +1,114 @@
+# coding=utf-8
+# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""RT-DETR ResNet model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+from ...utils.backbone_utils import BackboneConfigMixin, get_aligned_output_features_output_indices
+
+
+logger = logging.get_logger(__name__)
+
+
+class RTDetrResNetConfig(BackboneConfigMixin, PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`RTDetrResnetBackbone`]. It is used to instantiate an
+    ResNet model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the ResNet
+    [microsoft/resnet-50](https://huggingface.co/microsoft/resnet-50) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+    Args:
+        num_channels (`int`, *optional*, defaults to 3):
+            The number of input channels.
+        embedding_size (`int`, *optional*, defaults to 64):
+            Dimensionality (hidden size) for the embedding layer.
+        hidden_sizes (`list[int]`, *optional*, defaults to `[256, 512, 1024, 2048]`):
+            Dimensionality (hidden size) at each stage.
+        depths (`list[int]`, *optional*, defaults to `[3, 4, 6, 3]`):
+            Depth (number of layers) for each stage.
+        layer_type (`str`, *optional*, defaults to `"bottleneck"`):
+            The layer to use, it can be either `"basic"` (used for smaller models, like resnet-18 or resnet-34) or
+            `"bottleneck"` (used for larger models like resnet-50 and above).
+        hidden_act (`str`, *optional*, defaults to `"relu"`):
+            The non-linear activation function in each block. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"`
+            are supported.
+        downsample_in_first_stage (`bool`, *optional*, defaults to `False`):
+            If `True`, the first stage will downsample the inputs using a `stride` of 2.
+        downsample_in_bottleneck (`bool`, *optional*, defaults to `False`):
+            If `True`, the first conv 1x1 in ResNetBottleNeckLayer will downsample the inputs using a `stride` of 2.
+        out_features (`list[str]`, *optional*):
+            If used as backbone, list of features to output. Can be any of `"stem"`, `"stage1"`, `"stage2"`, etc.
+            (depending on how many stages the model has). If unset and `out_indices` is set, will default to the
+            corresponding stages. If unset and `out_indices` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+        out_indices (`list[int]`, *optional*):
+            If used as backbone, list of indices of features to output. Can be any of 0, 1, 2, etc. (depending on how
+            many stages the model has). If unset and `out_features` is set, will default to the corresponding stages.
+            If unset and `out_features` is unset, will default to the last stage. Must be in the
+            same order as defined in the `stage_names` attribute.
+
+    Example:
+    ```python
+    >>> from transformers import RTDetrResNetConfig, RTDetrResnetBackbone
+
+    >>> # Initializing a ResNet resnet-50 style configuration
+    >>> configuration = RTDetrResNetConfig()
+
+    >>> # Initializing a model (with random weights) from the resnet-50 style configuration
+    >>> model = RTDetrResnetBackbone(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```
+    """
+
+    model_type = "rt_detr_resnet"
+    layer_types = ["basic", "bottleneck"]
+
+    def __init__(
+        self,
+        num_channels=3,
+        embedding_size=64,
+        hidden_sizes=[256, 512, 1024, 2048],
+        depths=[3, 4, 6, 3],
+        layer_type="bottleneck",
+        hidden_act="relu",
+        downsample_in_first_stage=False,
+        downsample_in_bottleneck=False,
+        out_features=None,
+        out_indices=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        if layer_type not in self.layer_types:
+            raise ValueError(f"layer_type={layer_type} is not one of {','.join(self.layer_types)}")
+        self.num_channels = num_channels
+        self.embedding_size = embedding_size
+        self.hidden_sizes = hidden_sizes
+        self.depths = depths
+        self.layer_type = layer_type
+        self.hidden_act = hidden_act
+        self.downsample_in_first_stage = downsample_in_first_stage
+        self.downsample_in_bottleneck = downsample_in_bottleneck
+        self.stage_names = ["stem"] + [f"stage{idx}" for idx in range(1, len(depths) + 1)]
+        self._out_features, self._out_indices = get_aligned_output_features_output_indices(
+            out_features=out_features, out_indices=out_indices, stage_names=self.stage_names
+        )
+
+
+__all__ = ["RTDetrResNetConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..9aae271deaccbab9784dd7e02059eba29d32d5f2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/image_processing_rt_detr_fast.py
@@ -0,0 +1,559 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/rt_detr/modular_rt_detr.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_rt_detr.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import pathlib
+from typing import Any, Optional, Union
+
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    SizeDict,
+    get_image_size_for_max_height_width,
+    get_max_height_width,
+    safe_squeeze,
+)
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    AnnotationFormat,
+    AnnotationType,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    get_image_size,
+    validate_annotations,
+)
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring, requires_backends
+from ...utils.import_utils import requires
+from .image_processing_rt_detr import get_size_with_aspect_ratio
+
+
+class RTDetrFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    format (`str`, *optional*, defaults to `AnnotationFormat.COCO_DETECTION`):
+        Data format of the annotations. One of "coco_detection" or "coco_panoptic".
+    do_convert_annotations (`bool`, *optional*, defaults to `True`):
+        Controls whether to convert the annotations to the format expected by the RT_DETR model. Converts the
+        bounding boxes to the format `(center_x, center_y, width, height)` and in the range `[0, 1]`.
+        Can be overridden by the `do_convert_annotations` parameter in the `preprocess` method.
+    return_segmentation_masks (`bool`, *optional*, defaults to `False`):
+        Whether to return segmentation masks.
+    """
+
+    format: Optional[Union[str, AnnotationFormat]]
+    do_convert_annotations: Optional[bool]
+    return_segmentation_masks: Optional[bool]
+
+
+SUPPORTED_ANNOTATION_FORMATS = (AnnotationFormat.COCO_DETECTION, AnnotationFormat.COCO_PANOPTIC)
+
+
+def prepare_coco_detection_annotation(
+    image,
+    target,
+    return_segmentation_masks: bool = False,
+    input_data_format: Optional[Union[ChannelDimension, str]] = None,
+):
+    """
+    Convert the target in COCO format into the format expected by RT-DETR.
+    """
+    image_height, image_width = image.size()[-2:]
+
+    image_id = target["image_id"]
+    image_id = torch.as_tensor([image_id], dtype=torch.int64, device=image.device)
+
+    # Get all COCO annotations for the given image.
+    annotations = target["annotations"]
+    classes = []
+    area = []
+    boxes = []
+    keypoints = []
+    for obj in annotations:
+        if "iscrowd" not in obj or obj["iscrowd"] == 0:
+            classes.append(obj["category_id"])
+            area.append(obj["area"])
+            boxes.append(obj["bbox"])
+            if "keypoints" in obj:
+                keypoints.append(obj["keypoints"])
+
+    classes = torch.as_tensor(classes, dtype=torch.int64, device=image.device)
+    area = torch.as_tensor(area, dtype=torch.float32, device=image.device)
+    iscrowd = torch.zeros_like(classes, dtype=torch.int64, device=image.device)
+    # guard against no boxes via resizing
+    boxes = torch.as_tensor(boxes, dtype=torch.float32, device=image.device).reshape(-1, 4)
+    boxes[:, 2:] += boxes[:, :2]
+    boxes[:, 0::2] = boxes[:, 0::2].clip(min=0, max=image_width)
+    boxes[:, 1::2] = boxes[:, 1::2].clip(min=0, max=image_height)
+
+    keep = (boxes[:, 3] > boxes[:, 1]) & (boxes[:, 2] > boxes[:, 0])
+
+    new_target = {
+        "image_id": image_id,
+        "class_labels": classes[keep],
+        "boxes": boxes[keep],
+        "area": area[keep],
+        "iscrowd": iscrowd[keep],
+        "orig_size": torch.as_tensor([int(image_height), int(image_width)], dtype=torch.int64, device=image.device),
+    }
+
+    if keypoints:
+        keypoints = torch.as_tensor(keypoints, dtype=torch.float32, device=image.device)
+        # Apply the keep mask here to filter the relevant annotations
+        keypoints = keypoints[keep]
+        num_keypoints = keypoints.shape[0]
+        keypoints = keypoints.reshape((-1, 3)) if num_keypoints else keypoints
+        new_target["keypoints"] = keypoints
+
+    return new_target
+
+
+@auto_docstring
+@requires(backends=("torchvision", "torch"))
+class RTDetrImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    format = AnnotationFormat.COCO_DETECTION
+    do_resize = True
+    do_rescale = True
+    do_normalize = False
+    do_pad = False
+    size = {"height": 640, "width": 640}
+    default_to_square = False
+    model_input_names = ["pixel_values", "pixel_mask"]
+    valid_kwargs = RTDetrFastImageProcessorKwargs
+    do_convert_annotations = True
+
+    def __init__(self, **kwargs: Unpack[RTDetrFastImageProcessorKwargs]) -> None:
+        # Backwards compatibility
+        do_convert_annotations = kwargs.get("do_convert_annotations")
+        do_normalize = kwargs.get("do_normalize")
+        if do_convert_annotations is None and getattr(self, "do_convert_annotations", None) is None:
+            self.do_convert_annotations = do_normalize if do_normalize is not None else self.do_normalize
+
+        super().__init__(**kwargs)
+
+    def prepare_annotation(
+        self,
+        image: torch.Tensor,
+        target: dict,
+        format: Optional[AnnotationFormat] = None,
+        return_segmentation_masks: Optional[bool] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        input_data_format: Optional[Union[str, ChannelDimension]] = None,
+    ) -> dict:
+        """
+        Prepare an annotation for feeding into RT_DETR model.
+        """
+        format = format if format is not None else self.format
+
+        if format == AnnotationFormat.COCO_DETECTION:
+            return_segmentation_masks = False if return_segmentation_masks is None else return_segmentation_masks
+            target = prepare_coco_detection_annotation(
+                image, target, return_segmentation_masks, input_data_format=input_data_format
+            )
+        else:
+            raise ValueError(f"Format {format} is not supported.")
+        return target
+
+    def resize(
+        self,
+        image: torch.Tensor,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"] = None,
+        **kwargs,
+    ) -> torch.Tensor:
+        """
+        Resize the image to the given size. Size can be `min_size` (scalar) or `(height, width)` tuple. If size is an
+        int, smaller edge of the image will be matched to this number.
+
+        Args:
+            image (`torch.Tensor`):
+                Image to resize.
+            size (`SizeDict`):
+                Size of the image's `(height, width)` dimensions after resizing. Available options are:
+                    - `{"height": int, "width": int}`: The image will be resized to the exact size `(height, width)`.
+                        Do NOT keep the aspect ratio.
+                    - `{"shortest_edge": int, "longest_edge": int}`: The image will be resized to a maximum size respecting
+                        the aspect ratio and keeping the shortest edge less or equal to `shortest_edge` and the longest edge
+                        less or equal to `longest_edge`.
+                    - `{"max_height": int, "max_width": int}`: The image will be resized to the maximum size respecting the
+                        aspect ratio and keeping the height less or equal to `max_height` and the width less or equal to
+                        `max_width`.
+            interpolation (`InterpolationMode`, *optional*, defaults to `InterpolationMode.BILINEAR`):
+                Resampling filter to use if resizing the image.
+        """
+        interpolation = interpolation if interpolation is not None else F.InterpolationMode.BILINEAR
+        if size.shortest_edge and size.longest_edge:
+            # Resize the image so that the shortest edge or the longest edge is of the given size
+            # while maintaining the aspect ratio of the original image.
+            new_size = get_size_with_aspect_ratio(
+                image.size()[-2:],
+                size["shortest_edge"],
+                size["longest_edge"],
+            )
+        elif size.max_height and size.max_width:
+            new_size = get_image_size_for_max_height_width(image.size()[-2:], size["max_height"], size["max_width"])
+        elif size.height and size.width:
+            new_size = (size["height"], size["width"])
+        else:
+            raise ValueError(
+                "Size must contain 'height' and 'width' keys or 'shortest_edge' and 'longest_edge' keys. Got"
+                f" {size.keys()}."
+            )
+
+        image = F.resize(
+            image,
+            size=new_size,
+            interpolation=interpolation,
+            **kwargs,
+        )
+        return image
+
+    def resize_annotation(
+        self,
+        annotation: dict[str, Any],
+        orig_size: tuple[int, int],
+        target_size: tuple[int, int],
+        threshold: float = 0.5,
+        interpolation: Optional["F.InterpolationMode"] = None,
+    ):
+        """
+        Resizes an annotation to a target size.
+
+        Args:
+            annotation (`dict[str, Any]`):
+                The annotation dictionary.
+            orig_size (`tuple[int, int]`):
+                The original size of the input image.
+            target_size (`tuple[int, int]`):
+                The target size of the image, as returned by the preprocessing `resize` step.
+            threshold (`float`, *optional*, defaults to 0.5):
+                The threshold used to binarize the segmentation masks.
+            resample (`InterpolationMode`, defaults to `F.InterpolationMode.NEAREST_EXACT`):
+                The resampling filter to use when resizing the masks.
+        """
+        interpolation = interpolation if interpolation is not None else F.InterpolationMode.NEAREST_EXACT
+        ratio_height, ratio_width = [target / orig for target, orig in zip(target_size, orig_size)]
+
+        new_annotation = {}
+        new_annotation["size"] = target_size
+
+        for key, value in annotation.items():
+            if key == "boxes":
+                boxes = value
+                scaled_boxes = boxes * torch.as_tensor(
+                    [ratio_width, ratio_height, ratio_width, ratio_height], dtype=torch.float32, device=boxes.device
+                )
+                new_annotation["boxes"] = scaled_boxes
+            elif key == "area":
+                area = value
+                scaled_area = area * (ratio_width * ratio_height)
+                new_annotation["area"] = scaled_area
+            elif key == "masks":
+                masks = value[:, None]
+                masks = [F.resize(mask, target_size, interpolation=interpolation) for mask in masks]
+                masks = torch.stack(masks).to(torch.float32)
+                masks = masks[:, 0] > threshold
+                new_annotation["masks"] = masks
+            elif key == "size":
+                new_annotation["size"] = target_size
+            else:
+                new_annotation[key] = value
+
+        return new_annotation
+
+    def normalize_annotation(self, annotation: dict, image_size: tuple[int, int]) -> dict:
+        image_height, image_width = image_size
+        norm_annotation = {}
+        for key, value in annotation.items():
+            if key == "boxes":
+                boxes = value
+                boxes = corners_to_center_format(boxes)
+                boxes /= torch.as_tensor(
+                    [image_width, image_height, image_width, image_height], dtype=torch.float32, device=boxes.device
+                )
+                norm_annotation[key] = boxes
+            else:
+                norm_annotation[key] = value
+        return norm_annotation
+
+    def _update_annotation_for_padded_image(
+        self,
+        annotation: dict,
+        input_image_size: tuple[int, int],
+        output_image_size: tuple[int, int],
+        padding,
+        update_bboxes,
+    ) -> dict:
+        """
+        Update the annotation for a padded image.
+        """
+        new_annotation = {}
+        new_annotation["size"] = output_image_size
+        ratio_height, ratio_width = (input / output for output, input in zip(output_image_size, input_image_size))
+
+        for key, value in annotation.items():
+            if key == "masks":
+                masks = value
+                masks = F.pad(
+                    masks,
+                    padding,
+                    fill=0,
+                )
+                masks = safe_squeeze(masks, 1)
+                new_annotation["masks"] = masks
+            elif key == "boxes" and update_bboxes:
+                boxes = value
+                boxes *= torch.as_tensor([ratio_width, ratio_height, ratio_width, ratio_height], device=boxes.device)
+                new_annotation["boxes"] = boxes
+            elif key == "size":
+                new_annotation["size"] = output_image_size
+            else:
+                new_annotation[key] = value
+        return new_annotation
+
+    def pad(
+        self,
+        image: torch.Tensor,
+        padded_size: tuple[int, int],
+        annotation: Optional[dict[str, Any]] = None,
+        update_bboxes: bool = True,
+        fill: int = 0,
+    ):
+        original_size = image.size()[-2:]
+        padding_bottom = padded_size[0] - original_size[0]
+        padding_right = padded_size[1] - original_size[1]
+        if padding_bottom < 0 or padding_right < 0:
+            raise ValueError(
+                f"Padding dimensions are negative. Please make sure that the padded size is larger than the "
+                f"original size. Got padded size: {padded_size}, original size: {original_size}."
+            )
+        if original_size != padded_size:
+            padding = [0, 0, padding_right, padding_bottom]
+            image = F.pad(image, padding, fill=fill)
+            if annotation is not None:
+                annotation = self._update_annotation_for_padded_image(
+                    annotation, original_size, padded_size, padding, update_bboxes
+                )
+
+        # Make a pixel mask for the image, where 1 indicates a valid pixel and 0 indicates padding.
+        pixel_mask = torch.zeros(padded_size, dtype=torch.int64, device=image.device)
+        pixel_mask[: original_size[0], : original_size[1]] = 1
+
+        return image, pixel_mask, annotation
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]] = None,
+        masks_path: Optional[Union[str, pathlib.Path]] = None,
+        **kwargs: Unpack[RTDetrFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        annotations (`AnnotationType` or `list[AnnotationType]`, *optional*):
+            List of annotations associated with the image or batch of images. If annotation is for object
+            detection, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "annotations" (`list[Dict]`): List of annotations for an image. Each annotation should be a
+                dictionary. An image can have no annotations, in which case the list should be empty.
+            If annotation is for segmentation, the annotations should be a dictionary with the following keys:
+            - "image_id" (`int`): The image id.
+            - "segments_info" (`list[Dict]`): List of segments for an image. Each segment should be a dictionary.
+                An image can have no segments, in which case the list should be empty.
+            - "file_name" (`str`): The file name of the image.
+        masks_path (`str` or `pathlib.Path`, *optional*):
+            Path to the directory containing the segmentation masks.
+        """
+        return super().preprocess(images, annotations, masks_path, **kwargs)
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        annotations: Optional[Union[AnnotationType, list[AnnotationType]]],
+        masks_path: Optional[Union[str, pathlib.Path]],
+        return_segmentation_masks: bool,
+        do_resize: bool,
+        size: SizeDict,
+        interpolation: Optional["F.InterpolationMode"],
+        do_rescale: bool,
+        rescale_factor: float,
+        do_normalize: bool,
+        do_convert_annotations: bool,
+        image_mean: Optional[Union[float, list[float]]],
+        image_std: Optional[Union[float, list[float]]],
+        do_pad: bool,
+        pad_size: Optional[SizeDict],
+        format: Optional[Union[str, AnnotationFormat]],
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Preprocess an image or a batch of images so that it can be used by the model.
+        """
+
+        if annotations is not None and isinstance(annotations, dict):
+            annotations = [annotations]
+
+        if annotations is not None and len(images) != len(annotations):
+            raise ValueError(
+                f"The number of images ({len(images)}) and annotations ({len(annotations)}) do not match."
+            )
+
+        format = AnnotationFormat(format)
+        if annotations is not None:
+            validate_annotations(format, SUPPORTED_ANNOTATION_FORMATS, annotations)
+
+        data = {}
+        processed_images = []
+        processed_annotations = []
+        pixel_masks = []  # Initialize pixel_masks here
+        for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+            # prepare (COCO annotations as a list of Dict -> DETR target as a single Dict per image)
+            if annotations is not None:
+                annotation = self.prepare_annotation(
+                    image,
+                    annotation,
+                    format,
+                    return_segmentation_masks=return_segmentation_masks,
+                    masks_path=masks_path,
+                    input_data_format=ChannelDimension.FIRST,
+                )
+
+            if do_resize:
+                resized_image = self.resize(image, size=size, interpolation=interpolation)
+                if annotations is not None:
+                    annotation = self.resize_annotation(
+                        annotation,
+                        orig_size=image.size()[-2:],
+                        target_size=resized_image.size()[-2:],
+                    )
+                image = resized_image
+            # Fused rescale and normalize
+            image = self.rescale_and_normalize(image, do_rescale, rescale_factor, do_normalize, image_mean, image_std)
+            if do_convert_annotations and annotations is not None:
+                annotation = self.normalize_annotation(annotation, get_image_size(image, ChannelDimension.FIRST))
+
+            processed_images.append(image)
+            processed_annotations.append(annotation)
+        images = processed_images
+        annotations = processed_annotations if annotations is not None else None
+
+        if do_pad:
+            # depends on all resized image shapes so we need another loop
+            if pad_size is not None:
+                padded_size = (pad_size.height, pad_size.width)
+            else:
+                padded_size = get_max_height_width(images)
+
+            padded_images = []
+            padded_annotations = []
+            for image, annotation in zip(images, annotations if annotations is not None else [None] * len(images)):
+                # Pads images and returns their mask: {'pixel_values': ..., 'pixel_mask': ...}
+                if padded_size == image.size()[-2:]:
+                    padded_images.append(image)
+                    pixel_masks.append(torch.ones(padded_size, dtype=torch.int64, device=image.device))
+                    padded_annotations.append(annotation)
+                    continue
+                image, pixel_mask, annotation = self.pad(
+                    image, padded_size, annotation=annotation, update_bboxes=do_convert_annotations
+                )
+                padded_images.append(image)
+                padded_annotations.append(annotation)
+                pixel_masks.append(pixel_mask)
+            images = padded_images
+            annotations = padded_annotations if annotations is not None else None
+            data.update({"pixel_mask": torch.stack(pixel_masks, dim=0)})
+
+        data.update({"pixel_values": torch.stack(images, dim=0)})
+        encoded_inputs = BatchFeature(data, tensor_type=return_tensors)
+        if annotations is not None:
+            encoded_inputs["labels"] = [
+                BatchFeature(annotation, tensor_type=return_tensors) for annotation in annotations
+            ]
+        return encoded_inputs
+
+    def post_process_object_detection(
+        self,
+        outputs,
+        threshold: float = 0.5,
+        target_sizes: Union[TensorType, list[tuple]] = None,
+        use_focal_loss: bool = True,
+    ):
+        """
+        Converts the raw output of [`DetrForObjectDetection`] into final bounding boxes in (top_left_x, top_left_y,
+        bottom_right_x, bottom_right_y) format. Only supports PyTorch.
+
+        Args:
+            outputs ([`DetrObjectDetectionOutput`]):
+                Raw outputs of the model.
+            threshold (`float`, *optional*, defaults to 0.5):
+                Score threshold to keep object detection predictions.
+            target_sizes (`torch.Tensor` or `list[tuple[int, int]]`, *optional*):
+                Tensor of shape `(batch_size, 2)` or list of tuples (`tuple[int, int]`) containing the target size
+                `(height, width)` of each image in the batch. If unset, predictions will not be resized.
+            use_focal_loss (`bool` defaults to `True`):
+                Variable informing if the focal loss was used to predict the outputs. If `True`, a sigmoid is applied
+                to compute the scores of each detection, otherwise, a softmax function is used.
+
+        Returns:
+            `list[Dict]`: A list of dictionaries, each dictionary containing the scores, labels and boxes for an image
+            in the batch as predicted by the model.
+        """
+        requires_backends(self, ["torch"])
+        out_logits, out_bbox = outputs.logits, outputs.pred_boxes
+        # convert from relative cxcywh to absolute xyxy
+        boxes = center_to_corners_format(out_bbox)
+        if target_sizes is not None:
+            if len(out_logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+            if isinstance(target_sizes, list):
+                img_h, img_w = torch.as_tensor(target_sizes).unbind(1)
+            else:
+                img_h, img_w = target_sizes.unbind(1)
+            scale_fct = torch.stack([img_w, img_h, img_w, img_h], dim=1).to(boxes.device)
+            boxes = boxes * scale_fct[:, None, :]
+
+        num_top_queries = out_logits.shape[1]
+        num_classes = out_logits.shape[2]
+
+        if use_focal_loss:
+            scores = torch.nn.functional.sigmoid(out_logits)
+            scores, index = torch.topk(scores.flatten(1), num_top_queries, axis=-1)
+            labels = index % num_classes
+            index = index // num_classes
+            boxes = boxes.gather(dim=1, index=index.unsqueeze(-1).repeat(1, 1, boxes.shape[-1]))
+        else:
+            scores = torch.nn.functional.softmax(out_logits)[:, :, :-1]
+            scores, labels = scores.max(dim=-1)
+            if scores.shape[1] > num_top_queries:
+                scores, index = torch.topk(scores, num_top_queries, dim=-1)
+                labels = torch.gather(labels, dim=1, index=index)
+                boxes = torch.gather(boxes, dim=1, index=index.unsqueeze(-1).tile(1, 1, boxes.shape[-1]))
+
+        results = []
+        for score, label, box in zip(scores, labels, boxes):
+            results.append(
+                {
+                    "scores": score[score > threshold],
+                    "labels": label[score > threshold],
+                    "boxes": box[score > threshold],
+                }
+            )
+
+        return results
+
+
+__all__ = ["RTDetrImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr.py
new file mode 100644
index 0000000000000000000000000000000000000000..1d4b64496969acb596b037841c745b109f3d11b5
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr.py
@@ -0,0 +1,2013 @@
+# coding=utf-8
+# Copyright 2024 Baidu Inc and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch RT-DETR model."""
+
+import math
+import warnings
+from dataclasses import dataclass
+from functools import partial
+from typing import Optional, Union
+
+import torch
+import torch.nn.functional as F
+from torch import Tensor, nn
+
+from ...activations import ACT2CLS, ACT2FN
+from ...image_transforms import center_to_corners_format, corners_to_center_format
+from ...integrations import use_kernel_forward_from_hub
+from ...modeling_outputs import BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...pytorch_utils import compile_compatible_method_lru_cache
+from ...utils import (
+    ModelOutput,
+    auto_docstring,
+    logging,
+    torch_int,
+)
+from ...utils.backbone_utils import load_backbone
+from .configuration_rt_detr import RTDetrConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# TODO: Replace all occurrences of the checkpoint with the final one
+
+
+@use_kernel_forward_from_hub("MultiScaleDeformableAttention")
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.MultiScaleDeformableAttention
+class MultiScaleDeformableAttention(nn.Module):
+    def forward(
+        self,
+        value: Tensor,
+        value_spatial_shapes: Tensor,
+        value_spatial_shapes_list: list[tuple],
+        level_start_index: Tensor,
+        sampling_locations: Tensor,
+        attention_weights: Tensor,
+        im2col_step: int,
+    ):
+        batch_size, _, num_heads, hidden_dim = value.shape
+        _, num_queries, num_heads, num_levels, num_points, _ = sampling_locations.shape
+        value_list = value.split([height * width for height, width in value_spatial_shapes_list], dim=1)
+        sampling_grids = 2 * sampling_locations - 1
+        sampling_value_list = []
+        for level_id, (height, width) in enumerate(value_spatial_shapes_list):
+            # batch_size, height*width, num_heads, hidden_dim
+            # -> batch_size, height*width, num_heads*hidden_dim
+            # -> batch_size, num_heads*hidden_dim, height*width
+            # -> batch_size*num_heads, hidden_dim, height, width
+            value_l_ = (
+                value_list[level_id]
+                .flatten(2)
+                .transpose(1, 2)
+                .reshape(batch_size * num_heads, hidden_dim, height, width)
+            )
+            # batch_size, num_queries, num_heads, num_points, 2
+            # -> batch_size, num_heads, num_queries, num_points, 2
+            # -> batch_size*num_heads, num_queries, num_points, 2
+            sampling_grid_l_ = sampling_grids[:, :, :, level_id].transpose(1, 2).flatten(0, 1)
+            # batch_size*num_heads, hidden_dim, num_queries, num_points
+            sampling_value_l_ = nn.functional.grid_sample(
+                value_l_,
+                sampling_grid_l_,
+                mode="bilinear",
+                padding_mode="zeros",
+                align_corners=False,
+            )
+            sampling_value_list.append(sampling_value_l_)
+        # (batch_size, num_queries, num_heads, num_levels, num_points)
+        # -> (batch_size, num_heads, num_queries, num_levels, num_points)
+        # -> (batch_size, num_heads, 1, num_queries, num_levels*num_points)
+        attention_weights = attention_weights.transpose(1, 2).reshape(
+            batch_size * num_heads, 1, num_queries, num_levels * num_points
+        )
+        output = (
+            (torch.stack(sampling_value_list, dim=-2).flatten(-2) * attention_weights)
+            .sum(-1)
+            .view(batch_size, num_heads * hidden_dim, num_queries)
+        )
+        return output.transpose(1, 2).contiguous()
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the RTDetrDecoder. This class adds two attributes to
+    BaseModelOutputWithCrossAttentions, namely:
+    - a stacked tensor of intermediate decoder hidden states (i.e. the output of each decoder layer)
+    - a stacked tensor of intermediate reference points.
+    """
+)
+class RTDetrDecoderOutput(ModelOutput):
+    r"""
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
+        Stacked intermediate logits (logits of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, hidden_size)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
+    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked initial reference points (initial reference points of each layer of the decoder).
+    cross_attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` and `config.add_cross_attention=True` is passed or when `config.output_attentions=True`):
+        Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
+        sequence_length)`. Attentions weights of the decoder's cross-attention layer, after the attention softmax,
+        used to compute the weighted average in the cross-attention heads.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_logits: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_predicted_corners: Optional[torch.FloatTensor] = None
+    initial_reference_points: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Base class for outputs of the RT-DETR encoder-decoder model.
+    """
+)
+class RTDetrModelOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, sequence_length, config.num_labels)`):
+        Stacked intermediate logits (logits of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
+    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points used for the first decoder layer.
+    init_reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`):
+        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+        picked as region proposals in the encoder stage. Output of bounding box binary classification (i.e.
+        foreground and background).
+    enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`):
+        Logits of predicted bounding boxes coordinates in the encoder stage.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+        foreground and background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    denoising_meta_values (`dict`):
+        Extra dictionary for the denoising related values.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_logits: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_predicted_corners: Optional[torch.FloatTensor] = None
+    initial_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    init_reference_points: Optional[torch.FloatTensor] = None
+    enc_topk_logits: Optional[torch.FloatTensor] = None
+    enc_topk_bboxes: Optional[torch.FloatTensor] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    denoising_meta_values: Optional[dict] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`RTDetrForObjectDetection`].
+    """
+)
+class RTDetrObjectDetectionOutput(ModelOutput):
+    r"""
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` are provided)):
+        Total loss as a linear combination of a negative log-likehood (cross-entropy) for class prediction and a
+        bounding box loss. The latter is defined as a linear combination of the L1 loss and the generalized
+        scale-invariant IoU loss.
+    loss_dict (`Dict`, *optional*):
+        A dictionary containing the individual losses. Useful for logging.
+    logits (`torch.FloatTensor` of shape `(batch_size, num_queries, num_classes + 1)`):
+        Classification logits (including no-object) for all queries.
+    pred_boxes (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)`):
+        Normalized boxes coordinates for all queries, represented as (center_x, center_y, width, height). These
+        values are normalized in [0, 1], relative to the size of each individual image in the batch (disregarding
+        possible padding). You can use [`~RTDetrImageProcessor.post_process_object_detection`] to retrieve the
+        unnormalized (absolute) bounding boxes.
+    auxiliary_outputs (`list[Dict]`, *optional*):
+        Optional, only returned when auxiliary losses are activated (i.e. `config.auxiliary_loss` is set to `True`)
+        and labels are provided. It is a list of dictionaries containing the two above keys (`logits` and
+        `pred_boxes`) for each decoder layer.
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+    intermediate_hidden_states (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, hidden_size)`):
+        Stacked intermediate hidden states (output of each layer of the decoder).
+    intermediate_logits (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, config.num_labels)`):
+        Stacked intermediate logits (logits of each layer of the decoder).
+    intermediate_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate reference points (reference points of each layer of the decoder).
+    intermediate_predicted_corners (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked intermediate predicted corners (predicted corners of each layer of the decoder).
+    initial_reference_points (`torch.FloatTensor` of shape `(batch_size, config.decoder_layers, num_queries, 4)`):
+        Stacked initial reference points (initial reference points of each layer of the decoder).
+    init_reference_points (`torch.FloatTensor` of shape  `(batch_size, num_queries, 4)`):
+        Initial reference points sent through the Transformer decoder.
+    enc_topk_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the encoder.
+    enc_topk_bboxes (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the encoder.
+    enc_outputs_class (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.num_labels)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Predicted bounding boxes scores where the top `config.two_stage_num_proposals` scoring bounding boxes are
+        picked as region proposals in the first stage. Output of bounding box binary classification (i.e.
+        foreground and background).
+    enc_outputs_coord_logits (`torch.FloatTensor` of shape `(batch_size, sequence_length, 4)`, *optional*, returned when `config.with_box_refine=True` and `config.two_stage=True`):
+        Logits of predicted bounding boxes coordinates in the first stage.
+    denoising_meta_values (`dict`):
+        Extra dictionary for the denoising related values
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    loss_dict: Optional[dict] = None
+    logits: Optional[torch.FloatTensor] = None
+    pred_boxes: Optional[torch.FloatTensor] = None
+    auxiliary_outputs: Optional[list[dict]] = None
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    intermediate_hidden_states: Optional[torch.FloatTensor] = None
+    intermediate_logits: Optional[torch.FloatTensor] = None
+    intermediate_reference_points: Optional[torch.FloatTensor] = None
+    intermediate_predicted_corners: Optional[torch.FloatTensor] = None
+    initial_reference_points: Optional[torch.FloatTensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    cross_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    init_reference_points: Optional[tuple[torch.FloatTensor]] = None
+    enc_topk_logits: Optional[torch.FloatTensor] = None
+    enc_topk_bboxes: Optional[torch.FloatTensor] = None
+    enc_outputs_class: Optional[torch.FloatTensor] = None
+    enc_outputs_coord_logits: Optional[torch.FloatTensor] = None
+    denoising_meta_values: Optional[dict] = None
+
+
+def _get_clones(partial_module, N):
+    return nn.ModuleList([partial_module() for i in range(N)])
+
+
+# Copied from transformers.models.conditional_detr.modeling_conditional_detr.inverse_sigmoid
+def inverse_sigmoid(x, eps=1e-5):
+    x = x.clamp(min=0, max=1)
+    x1 = x.clamp(min=eps)
+    x2 = (1 - x).clamp(min=eps)
+    return torch.log(x1 / x2)
+
+
+# Copied from transformers.models.detr.modeling_detr.DetrFrozenBatchNorm2d with Detr->RTDetr
+class RTDetrFrozenBatchNorm2d(nn.Module):
+    """
+    BatchNorm2d where the batch statistics and the affine parameters are fixed.
+
+    Copy-paste from torchvision.misc.ops with added eps before rqsrt, without which any other models than
+    torchvision.models.resnet[18,34,50,101] produce nans.
+    """
+
+    def __init__(self, n):
+        super().__init__()
+        self.register_buffer("weight", torch.ones(n))
+        self.register_buffer("bias", torch.zeros(n))
+        self.register_buffer("running_mean", torch.zeros(n))
+        self.register_buffer("running_var", torch.ones(n))
+
+    def _load_from_state_dict(
+        self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+    ):
+        num_batches_tracked_key = prefix + "num_batches_tracked"
+        if num_batches_tracked_key in state_dict:
+            del state_dict[num_batches_tracked_key]
+
+        super()._load_from_state_dict(
+            state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs
+        )
+
+    def forward(self, x):
+        # move reshapes to the beginning
+        # to make it user-friendly
+        weight = self.weight.reshape(1, -1, 1, 1)
+        bias = self.bias.reshape(1, -1, 1, 1)
+        running_var = self.running_var.reshape(1, -1, 1, 1)
+        running_mean = self.running_mean.reshape(1, -1, 1, 1)
+        epsilon = 1e-5
+        scale = weight * (running_var + epsilon).rsqrt()
+        bias = bias - running_mean * scale
+        return x * scale + bias
+
+
+# Copied from transformers.models.detr.modeling_detr.replace_batch_norm with Detr->RTDetr
+def replace_batch_norm(model):
+    r"""
+    Recursively replace all `torch.nn.BatchNorm2d` with `RTDetrFrozenBatchNorm2d`.
+
+    Args:
+        model (torch.nn.Module):
+            input model
+    """
+    for name, module in model.named_children():
+        if isinstance(module, nn.BatchNorm2d):
+            new_module = RTDetrFrozenBatchNorm2d(module.num_features)
+
+            if module.weight.device != torch.device("meta"):
+                new_module.weight.data.copy_(module.weight)
+                new_module.bias.data.copy_(module.bias)
+                new_module.running_mean.data.copy_(module.running_mean)
+                new_module.running_var.data.copy_(module.running_var)
+
+            model._modules[name] = new_module
+
+        if len(list(module.children())) > 0:
+            replace_batch_norm(module)
+
+
+def get_contrastive_denoising_training_group(
+    targets,
+    num_classes,
+    num_queries,
+    class_embed,
+    num_denoising_queries=100,
+    label_noise_ratio=0.5,
+    box_noise_scale=1.0,
+):
+    """
+    Creates a contrastive denoising training group using ground-truth samples. It adds noise to labels and boxes.
+
+    Args:
+        targets (`list[dict]`):
+            The target objects, each containing 'class_labels' and 'boxes' for objects in an image.
+        num_classes (`int`):
+            Total number of classes in the dataset.
+        num_queries (`int`):
+            Number of query slots in the transformer.
+        class_embed (`callable`):
+            A function or a model layer to embed class labels.
+        num_denoising_queries (`int`, *optional*, defaults to 100):
+            Number of denoising queries.
+        label_noise_ratio (`float`, *optional*, defaults to 0.5):
+            Ratio of noise applied to labels.
+        box_noise_scale (`float`, *optional*, defaults to 1.0):
+            Scale of noise applied to bounding boxes.
+    Returns:
+        `tuple` comprising various elements:
+        - **input_query_class** (`torch.FloatTensor`) --
+          Class queries with applied label noise.
+        - **input_query_bbox** (`torch.FloatTensor`) --
+          Bounding box queries with applied box noise.
+        - **attn_mask** (`torch.FloatTensor`) --
+           Attention mask for separating denoising and reconstruction queries.
+        - **denoising_meta_values** (`dict`) --
+          Metadata including denoising positive indices, number of groups, and split sizes.
+    """
+
+    if num_denoising_queries <= 0:
+        return None, None, None, None
+
+    num_ground_truths = [len(t["class_labels"]) for t in targets]
+    device = targets[0]["class_labels"].device
+
+    max_gt_num = max(num_ground_truths)
+    if max_gt_num == 0:
+        return None, None, None, None
+
+    num_groups_denoising_queries = num_denoising_queries // max_gt_num
+    num_groups_denoising_queries = 1 if num_groups_denoising_queries == 0 else num_groups_denoising_queries
+    # pad gt to max_num of a batch
+    batch_size = len(num_ground_truths)
+
+    input_query_class = torch.full([batch_size, max_gt_num], num_classes, dtype=torch.int32, device=device)
+    input_query_bbox = torch.zeros([batch_size, max_gt_num, 4], device=device)
+    pad_gt_mask = torch.zeros([batch_size, max_gt_num], dtype=torch.bool, device=device)
+
+    for i in range(batch_size):
+        num_gt = num_ground_truths[i]
+        if num_gt > 0:
+            input_query_class[i, :num_gt] = targets[i]["class_labels"]
+            input_query_bbox[i, :num_gt] = targets[i]["boxes"]
+            pad_gt_mask[i, :num_gt] = 1
+    # each group has positive and negative queries.
+    input_query_class = input_query_class.tile([1, 2 * num_groups_denoising_queries])
+    input_query_bbox = input_query_bbox.tile([1, 2 * num_groups_denoising_queries, 1])
+    pad_gt_mask = pad_gt_mask.tile([1, 2 * num_groups_denoising_queries])
+    # positive and negative mask
+    negative_gt_mask = torch.zeros([batch_size, max_gt_num * 2, 1], device=device)
+    negative_gt_mask[:, max_gt_num:] = 1
+    negative_gt_mask = negative_gt_mask.tile([1, num_groups_denoising_queries, 1])
+    positive_gt_mask = 1 - negative_gt_mask
+    # contrastive denoising training positive index
+    positive_gt_mask = positive_gt_mask.squeeze(-1) * pad_gt_mask
+    denoise_positive_idx = torch.nonzero(positive_gt_mask)[:, 1]
+    denoise_positive_idx = torch.split(
+        denoise_positive_idx, [n * num_groups_denoising_queries for n in num_ground_truths]
+    )
+    # total denoising queries
+    num_denoising_queries = torch_int(max_gt_num * 2 * num_groups_denoising_queries)
+
+    if label_noise_ratio > 0:
+        mask = torch.rand_like(input_query_class, dtype=torch.float) < (label_noise_ratio * 0.5)
+        # randomly put a new one here
+        new_label = torch.randint_like(mask, 0, num_classes, dtype=input_query_class.dtype)
+        input_query_class = torch.where(mask & pad_gt_mask, new_label, input_query_class)
+
+    if box_noise_scale > 0:
+        known_bbox = center_to_corners_format(input_query_bbox)
+        diff = torch.tile(input_query_bbox[..., 2:] * 0.5, [1, 1, 2]) * box_noise_scale
+        rand_sign = torch.randint_like(input_query_bbox, 0, 2) * 2.0 - 1.0
+        rand_part = torch.rand_like(input_query_bbox)
+        rand_part = (rand_part + 1.0) * negative_gt_mask + rand_part * (1 - negative_gt_mask)
+        rand_part *= rand_sign
+        known_bbox += rand_part * diff
+        known_bbox.clip_(min=0.0, max=1.0)
+        input_query_bbox = corners_to_center_format(known_bbox)
+        input_query_bbox = inverse_sigmoid(input_query_bbox)
+
+    input_query_class = class_embed(input_query_class)
+
+    target_size = num_denoising_queries + num_queries
+    attn_mask = torch.full([target_size, target_size], 0, dtype=torch.float, device=device)
+    # match query cannot see the reconstruction
+    attn_mask[num_denoising_queries:, :num_denoising_queries] = -torch.inf
+
+    # reconstructions cannot see each other
+    for i in range(num_groups_denoising_queries):
+        idx_block_start = max_gt_num * 2 * i
+        idx_block_end = max_gt_num * 2 * (i + 1)
+        attn_mask[idx_block_start:idx_block_end, :idx_block_start] = -torch.inf
+        attn_mask[idx_block_start:idx_block_end, idx_block_end:num_denoising_queries] = -torch.inf
+
+    denoising_meta_values = {
+        "dn_positive_idx": denoise_positive_idx,
+        "dn_num_group": num_groups_denoising_queries,
+        "dn_num_split": [num_denoising_queries, num_queries],
+    }
+
+    return input_query_class, input_query_bbox, attn_mask, denoising_meta_values
+
+
+class RTDetrConvEncoder(nn.Module):
+    """
+    Convolutional backbone using the modeling_rt_detr_resnet.py.
+
+    nn.BatchNorm2d layers are replaced by RTDetrFrozenBatchNorm2d as defined above.
+    https://github.com/lyuwenyu/RT-DETR/blob/main/rtdetr_pytorch/src/nn/backbone/presnet.py#L142
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        backbone = load_backbone(config)
+
+        if config.freeze_backbone_batch_norms:
+            # replace batch norm by frozen batch norm
+            with torch.no_grad():
+                replace_batch_norm(backbone)
+        self.model = backbone
+        self.intermediate_channel_sizes = self.model.channels
+
+    def forward(self, pixel_values: torch.Tensor, pixel_mask: torch.Tensor):
+        # send pixel_values through the model to get list of feature maps
+        features = self.model(pixel_values).feature_maps
+
+        out = []
+        for feature_map in features:
+            # downsample pixel_mask to match shape of corresponding feature_map
+            mask = nn.functional.interpolate(pixel_mask[None].float(), size=feature_map.shape[-2:]).to(torch.bool)[0]
+            out.append((feature_map, mask))
+        return out
+
+
+class RTDetrConvNormLayer(nn.Module):
+    def __init__(self, config, in_channels, out_channels, kernel_size, stride, padding=None, activation=None):
+        super().__init__()
+        self.conv = nn.Conv2d(
+            in_channels,
+            out_channels,
+            kernel_size,
+            stride,
+            padding=(kernel_size - 1) // 2 if padding is None else padding,
+            bias=False,
+        )
+        self.norm = nn.BatchNorm2d(out_channels, config.batch_norm_eps)
+        self.activation = nn.Identity() if activation is None else ACT2CLS[activation]()
+
+    def forward(self, hidden_state):
+        hidden_state = self.conv(hidden_state)
+        hidden_state = self.norm(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        return hidden_state
+
+
+class RTDetrEncoderLayer(nn.Module):
+    def __init__(self, config: RTDetrConfig):
+        super().__init__()
+        self.normalize_before = config.normalize_before
+
+        # self-attention
+        self.self_attn = RTDetrMultiheadAttention(
+            embed_dim=config.encoder_hidden_dim,
+            num_heads=config.num_attention_heads,
+            dropout=config.dropout,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps)
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.encoder_activation_function]
+        self.activation_dropout = config.activation_dropout
+        self.fc1 = nn.Linear(config.encoder_hidden_dim, config.encoder_ffn_dim)
+        self.fc2 = nn.Linear(config.encoder_ffn_dim, config.encoder_hidden_dim)
+        self.final_layer_norm = nn.LayerNorm(config.encoder_hidden_dim, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        **kwargs,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`): attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Object queries (also called content embeddings), to be added to the hidden states.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        if self.normalize_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        if not self.normalize_before:
+            hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        if self.normalize_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+        residual = hidden_states
+
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+
+        hidden_states = self.fc2(hidden_states)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        hidden_states = residual + hidden_states
+        if not self.normalize_before:
+            hidden_states = self.final_layer_norm(hidden_states)
+
+        if self.training:
+            if torch.isinf(hidden_states).any() or torch.isnan(hidden_states).any():
+                clamp_value = torch.finfo(hidden_states.dtype).max - 1000
+                hidden_states = torch.clamp(hidden_states, min=-clamp_value, max=clamp_value)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class RTDetrRepVggBlock(nn.Module):
+    """
+    RepVGG architecture block introduced by the work "RepVGG: Making VGG-style ConvNets Great Again".
+    """
+
+    def __init__(self, config: RTDetrConfig):
+        super().__init__()
+
+        activation = config.activation_function
+        hidden_channels = int(config.encoder_hidden_dim * config.hidden_expansion)
+        self.conv1 = RTDetrConvNormLayer(config, hidden_channels, hidden_channels, 3, 1, padding=1)
+        self.conv2 = RTDetrConvNormLayer(config, hidden_channels, hidden_channels, 1, 1, padding=0)
+        self.activation = nn.Identity() if activation is None else ACT2CLS[activation]()
+
+    def forward(self, x):
+        y = self.conv1(x) + self.conv2(x)
+        return self.activation(y)
+
+
+class RTDetrCSPRepLayer(nn.Module):
+    """
+    Cross Stage Partial (CSP) network layer with RepVGG blocks.
+    """
+
+    def __init__(self, config: RTDetrConfig):
+        super().__init__()
+
+        in_channels = config.encoder_hidden_dim * 2
+        out_channels = config.encoder_hidden_dim
+        num_blocks = 3
+        activation = config.activation_function
+
+        hidden_channels = int(out_channels * config.hidden_expansion)
+        self.conv1 = RTDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
+        self.conv2 = RTDetrConvNormLayer(config, in_channels, hidden_channels, 1, 1, activation=activation)
+        self.bottlenecks = nn.Sequential(*[RTDetrRepVggBlock(config) for _ in range(num_blocks)])
+        if hidden_channels != out_channels:
+            self.conv3 = RTDetrConvNormLayer(config, hidden_channels, out_channels, 1, 1, activation=activation)
+        else:
+            self.conv3 = nn.Identity()
+
+    def forward(self, hidden_state):
+        hidden_state_1 = self.conv1(hidden_state)
+        hidden_state_1 = self.bottlenecks(hidden_state_1)
+        hidden_state_2 = self.conv2(hidden_state)
+        return self.conv3(hidden_state_1 + hidden_state_2)
+
+
+# Copied from transformers.models.deformable_detr.modeling_deformable_detr.DeformableDetrMultiscaleDeformableAttention with DeformableDetr->RTDetr
+class RTDetrMultiscaleDeformableAttention(nn.Module):
+    """
+    Multiscale deformable attention as proposed in Deformable DETR.
+    """
+
+    def __init__(self, config: RTDetrConfig, num_heads: int, n_points: int):
+        super().__init__()
+
+        self.attn = MultiScaleDeformableAttention()
+
+        if config.d_model % num_heads != 0:
+            raise ValueError(
+                f"embed_dim (d_model) must be divisible by num_heads, but got {config.d_model} and {num_heads}"
+            )
+        dim_per_head = config.d_model // num_heads
+        # check if dim_per_head is power of 2
+        if not ((dim_per_head & (dim_per_head - 1) == 0) and dim_per_head != 0):
+            warnings.warn(
+                "You'd better set embed_dim (d_model) in RTDetrMultiscaleDeformableAttention to make the"
+                " dimension of each attention head a power of 2 which is more efficient in the authors' CUDA"
+                " implementation."
+            )
+
+        self.im2col_step = 64
+
+        self.d_model = config.d_model
+        self.n_levels = config.num_feature_levels
+        self.n_heads = num_heads
+        self.n_points = n_points
+
+        self.sampling_offsets = nn.Linear(config.d_model, num_heads * self.n_levels * n_points * 2)
+        self.attention_weights = nn.Linear(config.d_model, num_heads * self.n_levels * n_points)
+        self.value_proj = nn.Linear(config.d_model, config.d_model)
+        self.output_proj = nn.Linear(config.d_model, config.d_model)
+
+        self.disable_custom_kernels = config.disable_custom_kernels
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        output_attentions: bool = False,
+    ):
+        # add position embeddings to the hidden states before projecting to queries and keys
+        if position_embeddings is not None:
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        batch_size, num_queries, _ = hidden_states.shape
+        batch_size, sequence_length, _ = encoder_hidden_states.shape
+        total_elements = sum(height * width for height, width in spatial_shapes_list)
+        if total_elements != sequence_length:
+            raise ValueError(
+                "Make sure to align the spatial shapes with the sequence length of the encoder hidden states"
+            )
+
+        value = self.value_proj(encoder_hidden_states)
+        if attention_mask is not None:
+            # we invert the attention_mask
+            value = value.masked_fill(~attention_mask[..., None], float(0))
+        value = value.view(batch_size, sequence_length, self.n_heads, self.d_model // self.n_heads)
+        sampling_offsets = self.sampling_offsets(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points, 2
+        )
+        attention_weights = self.attention_weights(hidden_states).view(
+            batch_size, num_queries, self.n_heads, self.n_levels * self.n_points
+        )
+        attention_weights = F.softmax(attention_weights, -1).view(
+            batch_size, num_queries, self.n_heads, self.n_levels, self.n_points
+        )
+        # batch_size, num_queries, n_heads, n_levels, n_points, 2
+        num_coordinates = reference_points.shape[-1]
+        if num_coordinates == 2:
+            offset_normalizer = torch.stack([spatial_shapes[..., 1], spatial_shapes[..., 0]], -1)
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :]
+                + sampling_offsets / offset_normalizer[None, None, None, :, None, :]
+            )
+        elif num_coordinates == 4:
+            sampling_locations = (
+                reference_points[:, :, None, :, None, :2]
+                + sampling_offsets / self.n_points * reference_points[:, :, None, :, None, 2:] * 0.5
+            )
+        else:
+            raise ValueError(f"Last dim of reference_points must be 2 or 4, but got {reference_points.shape[-1]}")
+
+        output = self.attn(
+            value,
+            spatial_shapes,
+            spatial_shapes_list,
+            level_start_index,
+            sampling_locations,
+            attention_weights,
+            self.im2col_step,
+        )
+
+        output = self.output_proj(output)
+
+        return output, attention_weights
+
+
+class RTDetrMultiheadAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper.
+
+    Here, we add position embeddings to the queries and keys (as explained in the Deformable DETR paper).
+    """
+
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        bias: bool = True,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        if self.head_dim * num_heads != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim} and `num_heads`:"
+                f" {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    def _reshape(self, tensor: torch.Tensor, seq_len: int, batch_size: int):
+        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2).contiguous()
+
+    def with_pos_embed(self, tensor: torch.Tensor, position_embeddings: Optional[Tensor]):
+        return tensor if position_embeddings is None else tensor + position_embeddings
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        batch_size, target_len, embed_dim = hidden_states.size()
+        # add position embeddings to the hidden states before projecting to queries and keys
+        if position_embeddings is not None:
+            hidden_states_original = hidden_states
+            hidden_states = self.with_pos_embed(hidden_states, position_embeddings)
+
+        # get queries, keys and values
+        query_states = self.q_proj(hidden_states) * self.scaling
+        key_states = self._reshape(self.k_proj(hidden_states), -1, batch_size)
+        value_states = self._reshape(self.v_proj(hidden_states_original), -1, batch_size)
+
+        proj_shape = (batch_size * self.num_heads, -1, self.head_dim)
+        query_states = self._reshape(query_states, target_len, batch_size).view(*proj_shape)
+        key_states = key_states.view(*proj_shape)
+        value_states = value_states.view(*proj_shape)
+
+        source_len = key_states.size(1)
+
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (batch_size * self.num_heads, target_len, source_len):
+            raise ValueError(
+                f"Attention weights should be of size {(batch_size * self.num_heads, target_len, source_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [seq_len, seq_len] -> [batch_size, 1, target_seq_len, source_seq_len]
+            attention_mask = attention_mask.expand(batch_size, 1, *attention_mask.size())
+
+        if attention_mask is not None:
+            if attention_mask.size() != (batch_size, 1, target_len, source_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(batch_size, 1, target_len, source_len)}, but is"
+                    f" {attention_mask.size()}"
+                )
+            if attention_mask.dtype == torch.bool:
+                attention_mask = torch.zeros_like(attention_mask, dtype=attn_weights.dtype).masked_fill_(
+                    attention_mask, -torch.inf
+                )
+            attn_weights = attn_weights.view(batch_size, self.num_heads, target_len, source_len) + attention_mask
+            attn_weights = attn_weights.view(batch_size * self.num_heads, target_len, source_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(batch_size, self.num_heads, target_len, source_len)
+            attn_weights = attn_weights_reshaped.view(batch_size * self.num_heads, target_len, source_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (batch_size * self.num_heads, target_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(batch_size, self.num_heads, target_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(batch_size, self.num_heads, target_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+        attn_output = attn_output.reshape(batch_size, target_len, embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+class RTDetrDecoderLayer(nn.Module):
+    def __init__(self, config: RTDetrConfig):
+        super().__init__()
+        # self-attention
+        self.self_attn = RTDetrMultiheadAttention(
+            embed_dim=config.d_model,
+            num_heads=config.decoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.decoder_activation_function]
+        self.activation_dropout = config.activation_dropout
+
+        self.self_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+        # cross-attention
+        self.encoder_attn = RTDetrMultiscaleDeformableAttention(
+            config,
+            num_heads=config.decoder_attention_heads,
+            n_points=config.decoder_n_points,
+        )
+        self.encoder_attn_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+        # feedforward neural networks
+        self.fc1 = nn.Linear(config.d_model, config.decoder_ffn_dim)
+        self.fc2 = nn.Linear(config.decoder_ffn_dim, config.d_model)
+        self.final_layer_norm = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: Optional[torch.Tensor] = None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ):
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                Input to the layer of shape `(seq_len, batch, embed_dim)`.
+            position_embeddings (`torch.FloatTensor`, *optional*):
+                Position embeddings that are added to the queries and keys in the self-attention layer.
+            reference_points (`torch.FloatTensor`, *optional*):
+                Reference points.
+            spatial_shapes (`torch.LongTensor`, *optional*):
+                Spatial shapes.
+            level_start_index (`torch.LongTensor`, *optional*):
+                Level start index.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(seq_len, batch, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`): encoder attention mask of size
+                `(batch, 1, target_len, source_len)` where padding elements are indicated by very large negative
+                values.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=encoder_attention_mask,
+            position_embeddings=position_embeddings,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        second_residual = hidden_states
+
+        # Cross-Attention
+        cross_attn_weights = None
+        hidden_states, cross_attn_weights = self.encoder_attn(
+            hidden_states=hidden_states,
+            encoder_hidden_states=encoder_hidden_states,
+            position_embeddings=position_embeddings,
+            reference_points=reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+        )
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = second_residual + hidden_states
+
+        hidden_states = self.encoder_attn_layer_norm(hidden_states)
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.activation_fn(self.fc1(hidden_states))
+        hidden_states = nn.functional.dropout(hidden_states, p=self.activation_dropout, training=self.training)
+        hidden_states = self.fc2(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+        hidden_states = residual + hidden_states
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights, cross_attn_weights)
+
+        return outputs
+
+
+@auto_docstring
+class RTDetrPreTrainedModel(PreTrainedModel):
+    config: RTDetrConfig
+    base_model_prefix = "rt_detr"
+    main_input_name = "pixel_values"
+    _no_split_modules = [r"RTDetrHybridEncoder", r"RTDetrDecoderLayer"]
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        if isinstance(module, (RTDetrForObjectDetection, RTDetrDecoder)):
+            if module.class_embed is not None:
+                for layer in module.class_embed:
+                    prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+                    bias = float(-math.log((1 - prior_prob) / prior_prob))
+                    nn.init.xavier_uniform_(layer.weight)
+                    nn.init.constant_(layer.bias, bias)
+
+            if module.bbox_embed is not None:
+                for layer in module.bbox_embed:
+                    nn.init.constant_(layer.layers[-1].weight, 0)
+                    nn.init.constant_(layer.layers[-1].bias, 0)
+
+        elif isinstance(module, RTDetrMultiscaleDeformableAttention):
+            nn.init.constant_(module.sampling_offsets.weight.data, 0.0)
+            default_dtype = torch.get_default_dtype()
+            thetas = torch.arange(module.n_heads, dtype=torch.int64).to(default_dtype) * (
+                2.0 * math.pi / module.n_heads
+            )
+            grid_init = torch.stack([thetas.cos(), thetas.sin()], -1)
+            grid_init = (
+                (grid_init / grid_init.abs().max(-1, keepdim=True)[0])
+                .view(module.n_heads, 1, 1, 2)
+                .repeat(1, module.n_levels, module.n_points, 1)
+            )
+            for i in range(module.n_points):
+                grid_init[:, :, i, :] *= i + 1
+            with torch.no_grad():
+                module.sampling_offsets.bias = nn.Parameter(grid_init.view(-1))
+            nn.init.constant_(module.attention_weights.weight.data, 0.0)
+            nn.init.constant_(module.attention_weights.bias.data, 0.0)
+            nn.init.xavier_uniform_(module.value_proj.weight.data)
+            nn.init.constant_(module.value_proj.bias.data, 0.0)
+            nn.init.xavier_uniform_(module.output_proj.weight.data)
+            nn.init.constant_(module.output_proj.bias.data, 0.0)
+
+        elif isinstance(module, RTDetrModel):
+            prior_prob = self.config.initializer_bias_prior_prob or 1 / (self.config.num_labels + 1)
+            bias = float(-math.log((1 - prior_prob) / prior_prob))
+            nn.init.xavier_uniform_(module.enc_score_head.weight)
+            nn.init.constant_(module.enc_score_head.bias, bias)
+
+        elif isinstance(module, (nn.Linear, nn.Conv2d, nn.BatchNorm2d)):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+            if module.bias is not None:
+                module.bias.data.zero_()
+
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
+        if hasattr(module, "weight_embedding") and self.config.learn_initial_query:
+            nn.init.xavier_uniform_(module.weight_embedding.weight)
+        if hasattr(module, "denoising_class_embed") and self.config.num_denoising > 0:
+            nn.init.xavier_uniform_(module.denoising_class_embed.weight)
+
+
+class RTDetrEncoder(nn.Module):
+    def __init__(self, config: RTDetrConfig):
+        super().__init__()
+
+        self.layers = nn.ModuleList([RTDetrEncoderLayer(config) for _ in range(config.encoder_layers)])
+
+    def forward(self, src, src_mask=None, pos_embed=None, output_attentions: bool = False) -> torch.Tensor:
+        hidden_states = src
+        for layer in self.layers:
+            hidden_states = layer(
+                hidden_states,
+                attention_mask=src_mask,
+                position_embeddings=pos_embed,
+                output_attentions=output_attentions,
+            )
+        return hidden_states
+
+
+class RTDetrHybridEncoder(nn.Module):
+    """
+    Decoder consisting of a projection layer, a set of `RTDetrEncoder`, a top-down Feature Pyramid Network
+    (FPN) and a bottom-up Path Aggregation Network (PAN). More details on the paper: https://huggingface.co/papers/2304.08069
+
+    Args:
+        config: RTDetrConfig
+    """
+
+    def __init__(self, config: RTDetrConfig):
+        super().__init__()
+        self.config = config
+        self.in_channels = config.encoder_in_channels
+        self.feat_strides = config.feat_strides
+        self.encoder_hidden_dim = config.encoder_hidden_dim
+        self.encode_proj_layers = config.encode_proj_layers
+        self.positional_encoding_temperature = config.positional_encoding_temperature
+        self.eval_size = config.eval_size
+        self.out_channels = [self.encoder_hidden_dim for _ in self.in_channels]
+        self.out_strides = self.feat_strides
+        self.num_fpn_stages = len(self.in_channels) - 1
+        self.num_pan_stages = len(self.in_channels) - 1
+        activation = config.activation_function
+
+        # encoder transformer
+        self.encoder = nn.ModuleList([RTDetrEncoder(config) for _ in range(len(self.encode_proj_layers))])
+
+        # top-down FPN
+        self.lateral_convs = nn.ModuleList()
+        self.fpn_blocks = nn.ModuleList()
+        for _ in range(self.num_fpn_stages):
+            lateral_conv = RTDetrConvNormLayer(
+                config,
+                in_channels=self.encoder_hidden_dim,
+                out_channels=self.encoder_hidden_dim,
+                kernel_size=1,
+                stride=1,
+                activation=activation,
+            )
+            fpn_block = RTDetrCSPRepLayer(config)
+            self.lateral_convs.append(lateral_conv)
+            self.fpn_blocks.append(fpn_block)
+
+        # bottom-up PAN
+        self.downsample_convs = nn.ModuleList()
+        self.pan_blocks = nn.ModuleList()
+        for _ in range(self.num_pan_stages):
+            downsample_conv = RTDetrConvNormLayer(
+                config,
+                in_channels=self.encoder_hidden_dim,
+                out_channels=self.encoder_hidden_dim,
+                kernel_size=3,
+                stride=2,
+                activation=activation,
+            )
+            pan_block = RTDetrCSPRepLayer(config)
+            self.downsample_convs.append(downsample_conv)
+            self.pan_blocks.append(pan_block)
+
+    @staticmethod
+    def build_2d_sincos_position_embedding(
+        width, height, embed_dim=256, temperature=10000.0, device="cpu", dtype=torch.float32
+    ):
+        grid_w = torch.arange(torch_int(width), device=device).to(dtype)
+        grid_h = torch.arange(torch_int(height), device=device).to(dtype)
+        grid_w, grid_h = torch.meshgrid(grid_w, grid_h, indexing="ij")
+        if embed_dim % 4 != 0:
+            raise ValueError("Embed dimension must be divisible by 4 for 2D sin-cos position embedding")
+        pos_dim = embed_dim // 4
+        omega = torch.arange(pos_dim, device=device).to(dtype) / pos_dim
+        omega = 1.0 / (temperature**omega)
+
+        out_w = grid_w.flatten()[..., None] @ omega[None]
+        out_h = grid_h.flatten()[..., None] @ omega[None]
+
+        return torch.concat([out_w.sin(), out_w.cos(), out_h.sin(), out_h.cos()], dim=1)[None, :, :]
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        attention_mask=None,
+        position_embeddings=None,
+        spatial_shapes=None,
+        level_start_index=None,
+        valid_ratios=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Flattened feature map (output of the backbone + projection layer) that is passed to the encoder.
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding pixel features. Mask values selected in `[0, 1]`:
+                - 1 for pixel features that are real (i.e. **not masked**),
+                - 0 for pixel features that are padding (i.e. **masked**).
+                [What are attention masks?](../glossary#attention-mask)
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            spatial_shapes (`torch.LongTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of each feature map.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`):
+                Starting index of each feature map.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`):
+                Ratio of valid area in each feature level.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        hidden_states = inputs_embeds
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        # encoder
+        if self.config.encoder_layers > 0:
+            for i, enc_ind in enumerate(self.encode_proj_layers):
+                if output_hidden_states:
+                    encoder_states = encoder_states + (hidden_states[enc_ind],)
+                height, width = hidden_states[enc_ind].shape[2:]
+                # flatten [batch, channel, height, width] to [batch, height*width, channel]
+                src_flatten = hidden_states[enc_ind].flatten(2).permute(0, 2, 1)
+                if self.training or self.eval_size is None:
+                    pos_embed = self.build_2d_sincos_position_embedding(
+                        width,
+                        height,
+                        self.encoder_hidden_dim,
+                        self.positional_encoding_temperature,
+                        device=src_flatten.device,
+                        dtype=src_flatten.dtype,
+                    )
+                else:
+                    pos_embed = None
+
+                layer_outputs = self.encoder[i](
+                    src_flatten,
+                    pos_embed=pos_embed,
+                    output_attentions=output_attentions,
+                )
+                hidden_states[enc_ind] = (
+                    layer_outputs[0].permute(0, 2, 1).reshape(-1, self.encoder_hidden_dim, height, width).contiguous()
+                )
+
+                if output_attentions:
+                    all_attentions = all_attentions + (layer_outputs[1],)
+
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states[enc_ind],)
+
+        # top-down FPN
+        fpn_feature_maps = [hidden_states[-1]]
+        for idx, (lateral_conv, fpn_block) in enumerate(zip(self.lateral_convs, self.fpn_blocks)):
+            backbone_feature_map = hidden_states[self.num_fpn_stages - idx - 1]
+            top_fpn_feature_map = fpn_feature_maps[-1]
+            # apply lateral block
+            top_fpn_feature_map = lateral_conv(top_fpn_feature_map)
+            fpn_feature_maps[-1] = top_fpn_feature_map
+            # apply fpn block
+            top_fpn_feature_map = F.interpolate(top_fpn_feature_map, scale_factor=2.0, mode="nearest")
+            fused_feature_map = torch.concat([top_fpn_feature_map, backbone_feature_map], dim=1)
+            new_fpn_feature_map = fpn_block(fused_feature_map)
+            fpn_feature_maps.append(new_fpn_feature_map)
+
+        fpn_feature_maps = fpn_feature_maps[::-1]
+
+        # bottom-up PAN
+        pan_feature_maps = [fpn_feature_maps[0]]
+        for idx, (downsample_conv, pan_block) in enumerate(zip(self.downsample_convs, self.pan_blocks)):
+            top_pan_feature_map = pan_feature_maps[-1]
+            fpn_feature_map = fpn_feature_maps[idx + 1]
+            downsampled_feature_map = downsample_conv(top_pan_feature_map)
+            fused_feature_map = torch.concat([downsampled_feature_map, fpn_feature_map], dim=1)
+            new_pan_feature_map = pan_block(fused_feature_map)
+            pan_feature_maps.append(new_pan_feature_map)
+
+        if not return_dict:
+            return tuple(v for v in [pan_feature_maps, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=pan_feature_maps, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+class RTDetrDecoder(RTDetrPreTrainedModel):
+    def __init__(self, config: RTDetrConfig):
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layers = nn.ModuleList([RTDetrDecoderLayer(config) for _ in range(config.decoder_layers)])
+        self.query_pos_head = RTDetrMLPPredictionHead(config, 4, 2 * config.d_model, config.d_model, num_layers=2)
+
+        # hack implementation for iterative bounding box refinement and two-stage Deformable DETR
+        self.bbox_embed = None
+        self.class_embed = None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        inputs_embeds=None,
+        encoder_hidden_states=None,
+        encoder_attention_mask=None,
+        position_embeddings=None,
+        reference_points=None,
+        spatial_shapes=None,
+        spatial_shapes_list=None,
+        level_start_index=None,
+        valid_ratios=None,
+        output_attentions=None,
+        output_hidden_states=None,
+        return_dict=None,
+    ):
+        r"""
+        Args:
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`):
+                The query embeddings that are passed into the decoder.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            encoder_attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing cross-attention on padding pixel_values of the encoder. Mask values selected
+                in `[0, 1]`:
+                - 1 for pixels that are real (i.e. **not masked**),
+                - 0 for pixels that are padding (i.e. **masked**).
+            position_embeddings (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+                Position embeddings that are added to the queries and keys in each self-attention layer.
+            reference_points (`torch.FloatTensor` of shape `(batch_size, num_queries, 4)` is `as_two_stage` else `(batch_size, num_queries, 2)` or , *optional*):
+                Reference point in range `[0, 1]`, top-left (0,0), bottom-right (1, 1), including padding area.
+            spatial_shapes (`torch.FloatTensor` of shape `(num_feature_levels, 2)`):
+                Spatial shapes of the feature maps.
+            level_start_index (`torch.LongTensor` of shape `(num_feature_levels)`, *optional*):
+                Indexes for the start of each feature level. In range `[0, sequence_length]`.
+            valid_ratios (`torch.FloatTensor` of shape `(batch_size, num_feature_levels, 2)`, *optional*):
+                Ratio of valid area in each feature level.
+
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~file_utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if inputs_embeds is not None:
+            hidden_states = inputs_embeds
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+        intermediate = ()
+        intermediate_reference_points = ()
+        intermediate_logits = ()
+
+        reference_points = F.sigmoid(reference_points)
+
+        # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L252
+        for idx, decoder_layer in enumerate(self.layers):
+            reference_points_input = reference_points.unsqueeze(2)
+            position_embeddings = self.query_pos_head(reference_points)
+
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                position_embeddings=position_embeddings,
+                encoder_hidden_states=encoder_hidden_states,
+                reference_points=reference_points_input,
+                spatial_shapes=spatial_shapes,
+                spatial_shapes_list=spatial_shapes_list,
+                level_start_index=level_start_index,
+                encoder_attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+            )
+
+            hidden_states = layer_outputs[0]
+
+            # hack implementation for iterative bounding box refinement
+            if self.bbox_embed is not None:
+                predicted_corners = self.bbox_embed[idx](hidden_states)
+                new_reference_points = F.sigmoid(predicted_corners + inverse_sigmoid(reference_points))
+                reference_points = new_reference_points.detach()
+
+            intermediate += (hidden_states,)
+            intermediate_reference_points += (
+                (new_reference_points,) if self.bbox_embed is not None else (reference_points,)
+            )
+
+            if self.class_embed is not None:
+                logits = self.class_embed[idx](hidden_states)
+                intermediate_logits += (logits,)
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        # Keep batch_size as first dimension
+        intermediate = torch.stack(intermediate, dim=1)
+        intermediate_reference_points = torch.stack(intermediate_reference_points, dim=1)
+        if self.class_embed is not None:
+            intermediate_logits = torch.stack(intermediate_logits, dim=1)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [
+                    hidden_states,
+                    intermediate,
+                    intermediate_logits,
+                    intermediate_reference_points,
+                    all_hidden_states,
+                    all_self_attns,
+                    all_cross_attentions,
+                ]
+                if v is not None
+            )
+        return RTDetrDecoderOutput(
+            last_hidden_state=hidden_states,
+            intermediate_hidden_states=intermediate,
+            intermediate_logits=intermediate_logits,
+            intermediate_reference_points=intermediate_reference_points,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# taken from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+class RTDetrMLPPredictionHead(nn.Module):
+    """
+    Very simple multi-layer perceptron (MLP, also called FFN), used to predict the normalized center coordinates,
+    height and width of a bounding box w.r.t. an image.
+
+    Copied from https://github.com/facebookresearch/detr/blob/master/models/detr.py
+    Origin from https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_paddle/ppdet/modeling/transformers/utils.py#L453
+
+    """
+
+    def __init__(self, config, input_dim, d_model, output_dim, num_layers):
+        super().__init__()
+        self.num_layers = num_layers
+        h = [d_model] * (num_layers - 1)
+        self.layers = nn.ModuleList(nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]))
+
+    def forward(self, x):
+        for i, layer in enumerate(self.layers):
+            x = nn.functional.relu(layer(x)) if i < self.num_layers - 1 else layer(x)
+        return x
+
+
+@auto_docstring(
+    custom_intro="""
+    RT-DETR Model (consisting of a backbone and encoder-decoder) outputting raw hidden states without any head on top.
+    """
+)
+class RTDetrModel(RTDetrPreTrainedModel):
+    def __init__(self, config: RTDetrConfig):
+        super().__init__(config)
+
+        # Create backbone
+        self.backbone = RTDetrConvEncoder(config)
+        intermediate_channel_sizes = self.backbone.intermediate_channel_sizes
+
+        # Create encoder input projection layers
+        # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/hybrid_encoder.py#L212
+        num_backbone_outs = len(intermediate_channel_sizes)
+        encoder_input_proj_list = []
+        for _ in range(num_backbone_outs):
+            in_channels = intermediate_channel_sizes[_]
+            encoder_input_proj_list.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channels, config.encoder_hidden_dim, kernel_size=1, bias=False),
+                    nn.BatchNorm2d(config.encoder_hidden_dim),
+                )
+            )
+        self.encoder_input_proj = nn.ModuleList(encoder_input_proj_list)
+
+        # Create encoder
+        self.encoder = RTDetrHybridEncoder(config)
+
+        # denoising part
+        if config.num_denoising > 0:
+            self.denoising_class_embed = nn.Embedding(
+                config.num_labels + 1, config.d_model, padding_idx=config.num_labels
+            )
+
+        # decoder embedding
+        if config.learn_initial_query:
+            self.weight_embedding = nn.Embedding(config.num_queries, config.d_model)
+
+        # encoder head
+        self.enc_output = nn.Sequential(
+            nn.Linear(config.d_model, config.d_model),
+            nn.LayerNorm(config.d_model, eps=config.layer_norm_eps),
+        )
+        self.enc_score_head = nn.Linear(config.d_model, config.num_labels)
+        self.enc_bbox_head = RTDetrMLPPredictionHead(config, config.d_model, config.d_model, 4, num_layers=3)
+
+        # init encoder output anchors and valid_mask
+        if config.anchor_image_size:
+            self.anchors, self.valid_mask = self.generate_anchors(dtype=self.dtype)
+
+        # Create decoder input projection layers
+        # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
+        num_backbone_outs = len(config.decoder_in_channels)
+        decoder_input_proj_list = []
+        for _ in range(num_backbone_outs):
+            in_channels = config.decoder_in_channels[_]
+            decoder_input_proj_list.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channels, config.d_model, kernel_size=1, bias=False),
+                    nn.BatchNorm2d(config.d_model, config.batch_norm_eps),
+                )
+            )
+        for _ in range(config.num_feature_levels - num_backbone_outs):
+            decoder_input_proj_list.append(
+                nn.Sequential(
+                    nn.Conv2d(in_channels, config.d_model, kernel_size=3, stride=2, padding=1, bias=False),
+                    nn.BatchNorm2d(config.d_model, config.batch_norm_eps),
+                )
+            )
+            in_channels = config.d_model
+        self.decoder_input_proj = nn.ModuleList(decoder_input_proj_list)
+
+        # decoder
+        self.decoder = RTDetrDecoder(config)
+
+        self.post_init()
+
+    def get_encoder(self):
+        return self.encoder
+
+    def freeze_backbone(self):
+        for param in self.backbone.parameters():
+            param.requires_grad_(False)
+
+    def unfreeze_backbone(self):
+        for param in self.backbone.parameters():
+            param.requires_grad_(True)
+
+    @compile_compatible_method_lru_cache(maxsize=32)
+    def generate_anchors(self, spatial_shapes=None, grid_size=0.05, device="cpu", dtype=torch.float32):
+        if spatial_shapes is None:
+            spatial_shapes = [
+                [int(self.config.anchor_image_size[0] / s), int(self.config.anchor_image_size[1] / s)]
+                for s in self.config.feat_strides
+            ]
+        anchors = []
+        for level, (height, width) in enumerate(spatial_shapes):
+            grid_y, grid_x = torch.meshgrid(
+                torch.arange(end=height, device=device).to(dtype),
+                torch.arange(end=width, device=device).to(dtype),
+                indexing="ij",
+            )
+            grid_xy = torch.stack([grid_x, grid_y], -1)
+            grid_xy = grid_xy.unsqueeze(0) + 0.5
+            grid_xy[..., 0] /= width
+            grid_xy[..., 1] /= height
+            wh = torch.ones_like(grid_xy) * grid_size * (2.0**level)
+            anchors.append(torch.concat([grid_xy, wh], -1).reshape(-1, height * width, 4))
+        # define the valid range for anchor coordinates
+        eps = 1e-2
+        anchors = torch.concat(anchors, 1)
+        valid_mask = ((anchors > eps) * (anchors < 1 - eps)).all(-1, keepdim=True)
+        anchors = torch.log(anchors / (1 - anchors))
+        anchors = torch.where(valid_mask, anchors, torch.tensor(torch.finfo(dtype).max, dtype=dtype, device=device))
+
+        return anchors, valid_mask
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.FloatTensor], RTDetrModelOutput]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, RTDetrModel
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+        >>> model = RTDetrModel.from_pretrained("PekingU/rtdetr_r50vd")
+
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> outputs = model(**inputs)
+
+        >>> last_hidden_states = outputs.last_hidden_state
+        >>> list(last_hidden_states.shape)
+        [1, 300, 256]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        batch_size, num_channels, height, width = pixel_values.shape
+        device = pixel_values.device
+
+        if pixel_mask is None:
+            pixel_mask = torch.ones(((batch_size, height, width)), device=device)
+
+        features = self.backbone(pixel_values, pixel_mask)
+
+        proj_feats = [self.encoder_input_proj[level](source) for level, (source, mask) in enumerate(features)]
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                proj_feats,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if output_hidden_states else None,
+                attentions=encoder_outputs[2]
+                if len(encoder_outputs) > 2
+                else encoder_outputs[1]
+                if output_attentions
+                else None,
+            )
+
+        # Equivalent to def _get_encoder_input
+        # https://github.com/lyuwenyu/RT-DETR/blob/94f5e16708329d2f2716426868ec89aa774af016/rtdetr_pytorch/src/zoo/rtdetr/rtdetr_decoder.py#L412
+        sources = []
+        for level, source in enumerate(encoder_outputs[0]):
+            sources.append(self.decoder_input_proj[level](source))
+
+        # Lowest resolution feature maps are obtained via 3x3 stride 2 convolutions on the final stage
+        if self.config.num_feature_levels > len(sources):
+            _len_sources = len(sources)
+            sources.append(self.decoder_input_proj[_len_sources](encoder_outputs[0])[-1])
+            for i in range(_len_sources + 1, self.config.num_feature_levels):
+                sources.append(self.decoder_input_proj[i](encoder_outputs[0][-1]))
+
+        # Prepare encoder inputs (by flattening)
+        source_flatten = []
+        spatial_shapes_list = []
+        spatial_shapes = torch.empty((len(sources), 2), device=device, dtype=torch.long)
+        for level, source in enumerate(sources):
+            height, width = source.shape[-2:]
+            spatial_shapes[level, 0] = height
+            spatial_shapes[level, 1] = width
+            spatial_shapes_list.append((height, width))
+            source = source.flatten(2).transpose(1, 2)
+            source_flatten.append(source)
+        source_flatten = torch.cat(source_flatten, 1)
+        level_start_index = torch.cat((spatial_shapes.new_zeros((1,)), spatial_shapes.prod(1).cumsum(0)[:-1]))
+
+        # prepare denoising training
+        if self.training and self.config.num_denoising > 0 and labels is not None:
+            (
+                denoising_class,
+                denoising_bbox_unact,
+                attention_mask,
+                denoising_meta_values,
+            ) = get_contrastive_denoising_training_group(
+                targets=labels,
+                num_classes=self.config.num_labels,
+                num_queries=self.config.num_queries,
+                class_embed=self.denoising_class_embed,
+                num_denoising_queries=self.config.num_denoising,
+                label_noise_ratio=self.config.label_noise_ratio,
+                box_noise_scale=self.config.box_noise_scale,
+            )
+        else:
+            denoising_class, denoising_bbox_unact, attention_mask, denoising_meta_values = None, None, None, None
+
+        batch_size = len(source_flatten)
+        device = source_flatten.device
+        dtype = source_flatten.dtype
+
+        # prepare input for decoder
+        if self.training or self.config.anchor_image_size is None:
+            # Pass spatial_shapes as tuple to make it hashable and make sure
+            # lru_cache is working for generate_anchors()
+            spatial_shapes_tuple = tuple(spatial_shapes_list)
+            anchors, valid_mask = self.generate_anchors(spatial_shapes_tuple, device=device, dtype=dtype)
+        else:
+            anchors, valid_mask = self.anchors, self.valid_mask
+            anchors, valid_mask = anchors.to(device, dtype), valid_mask.to(device, dtype)
+
+        # use the valid_mask to selectively retain values in the feature map where the mask is `True`
+        memory = valid_mask.to(source_flatten.dtype) * source_flatten
+
+        output_memory = self.enc_output(memory)
+
+        enc_outputs_class = self.enc_score_head(output_memory)
+        enc_outputs_coord_logits = self.enc_bbox_head(output_memory) + anchors
+
+        _, topk_ind = torch.topk(enc_outputs_class.max(-1).values, self.config.num_queries, dim=1)
+
+        reference_points_unact = enc_outputs_coord_logits.gather(
+            dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_coord_logits.shape[-1])
+        )
+
+        enc_topk_bboxes = F.sigmoid(reference_points_unact)
+        if denoising_bbox_unact is not None:
+            reference_points_unact = torch.concat([denoising_bbox_unact, reference_points_unact], 1)
+
+        enc_topk_logits = enc_outputs_class.gather(
+            dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, enc_outputs_class.shape[-1])
+        )
+
+        # extract region features
+        if self.config.learn_initial_query:
+            target = self.weight_embedding.tile([batch_size, 1, 1])
+        else:
+            target = output_memory.gather(dim=1, index=topk_ind.unsqueeze(-1).repeat(1, 1, output_memory.shape[-1]))
+            target = target.detach()
+
+        if denoising_class is not None:
+            target = torch.concat([denoising_class, target], 1)
+
+        init_reference_points = reference_points_unact.detach()
+
+        # decoder
+        decoder_outputs = self.decoder(
+            inputs_embeds=target,
+            encoder_hidden_states=source_flatten,
+            encoder_attention_mask=attention_mask,
+            reference_points=init_reference_points,
+            spatial_shapes=spatial_shapes,
+            spatial_shapes_list=spatial_shapes_list,
+            level_start_index=level_start_index,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            enc_outputs = tuple(
+                value
+                for value in [enc_topk_logits, enc_topk_bboxes, enc_outputs_class, enc_outputs_coord_logits]
+                if value is not None
+            )
+            dn_outputs = tuple(value if value is not None else None for value in [denoising_meta_values])
+            tuple_outputs = decoder_outputs + encoder_outputs + (init_reference_points,) + enc_outputs + dn_outputs
+
+            return tuple_outputs
+
+        return RTDetrModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            intermediate_hidden_states=decoder_outputs.intermediate_hidden_states,
+            intermediate_logits=decoder_outputs.intermediate_logits,
+            intermediate_reference_points=decoder_outputs.intermediate_reference_points,
+            intermediate_predicted_corners=decoder_outputs.intermediate_predicted_corners,
+            initial_reference_points=decoder_outputs.initial_reference_points,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+            init_reference_points=init_reference_points,
+            enc_topk_logits=enc_topk_logits,
+            enc_topk_bboxes=enc_topk_bboxes,
+            enc_outputs_class=enc_outputs_class,
+            enc_outputs_coord_logits=enc_outputs_coord_logits,
+            denoising_meta_values=denoising_meta_values,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    RT-DETR Model (consisting of a backbone and encoder-decoder) outputting bounding boxes and logits to be further
+    decoded into scores and classes.
+    """
+)
+class RTDetrForObjectDetection(RTDetrPreTrainedModel):
+    # When using clones, all layers > 0 will be clones, but layer 0 *is* required
+    _tied_weights_keys = ["bbox_embed", "class_embed"]
+    # We can't initialize the model on meta device as some weights are modified during the initialization
+    _no_split_modules = None
+
+    def __init__(self, config: RTDetrConfig):
+        super().__init__(config)
+
+        # RTDETR encoder-decoder model
+        self.model = RTDetrModel(config)
+
+        # Detection heads on top
+        self.class_embed = partial(nn.Linear, config.d_model, config.num_labels)
+        self.bbox_embed = partial(RTDetrMLPPredictionHead, config, config.d_model, config.d_model, 4, num_layers=3)
+
+        # if two-stage, the last class_embed and bbox_embed is for region proposal generation
+        num_pred = config.decoder_layers
+        if config.with_box_refine:
+            self.class_embed = _get_clones(self.class_embed, num_pred)
+            self.bbox_embed = _get_clones(self.bbox_embed, num_pred)
+        else:
+            self.class_embed = nn.ModuleList([self.class_embed() for _ in range(num_pred)])
+            self.bbox_embed = nn.ModuleList([self.bbox_embed() for _ in range(num_pred)])
+
+        # hack implementation for iterative bounding box refinement
+        self.model.decoder.class_embed = self.class_embed
+        self.model.decoder.bbox_embed = self.bbox_embed
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @torch.jit.unused
+    def _set_aux_loss(self, outputs_class, outputs_coord):
+        # this is a workaround to make torchscript happy, as torchscript
+        # doesn't support dictionary with non-homogeneous values, such
+        # as a dict having both a Tensor and a list.
+        return [{"logits": a, "pred_boxes": b} for a, b in zip(outputs_class, outputs_coord)]
+
+    @auto_docstring
+    def forward(
+        self,
+        pixel_values: torch.FloatTensor,
+        pixel_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[torch.FloatTensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[list[dict]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple[torch.FloatTensor], RTDetrObjectDetectionOutput]:
+        r"""
+        inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing the flattened feature map (output of the backbone + projection layer), you
+            can choose to directly pass a flattened representation of an image.
+        decoder_inputs_embeds (`torch.FloatTensor` of shape `(batch_size, num_queries, hidden_size)`, *optional*):
+            Optionally, instead of initializing the queries with a tensor of zeros, you can choose to directly pass an
+            embedded representation.
+        labels (`list[Dict]` of len `(batch_size,)`, *optional*):
+            Labels for computing the bipartite matching loss. List of dicts, each dictionary containing at least the
+            following 2 keys: 'class_labels' and 'boxes' (the class labels and bounding boxes of an image in the batch
+            respectively). The class labels themselves should be a `torch.LongTensor` of len `(number of bounding boxes
+            in the image,)` and the boxes a `torch.FloatTensor` of shape `(number of bounding boxes in the image, 4)`.
+
+        Examples:
+
+        ```python
+        >>> from transformers import RTDetrImageProcessor, RTDetrForObjectDetection
+        >>> from PIL import Image
+        >>> import requests
+        >>> import torch
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = RTDetrImageProcessor.from_pretrained("PekingU/rtdetr_r50vd")
+        >>> model = RTDetrForObjectDetection.from_pretrained("PekingU/rtdetr_r50vd")
+
+        >>> # prepare image for the model
+        >>> inputs = image_processor(images=image, return_tensors="pt")
+
+        >>> # forward pass
+        >>> outputs = model(**inputs)
+
+        >>> logits = outputs.logits
+        >>> list(logits.shape)
+        [1, 300, 80]
+
+        >>> boxes = outputs.pred_boxes
+        >>> list(boxes.shape)
+        [1, 300, 4]
+
+        >>> # convert outputs (bounding boxes and class logits) to Pascal VOC format (xmin, ymin, xmax, ymax)
+        >>> target_sizes = torch.tensor([image.size[::-1]])
+        >>> results = image_processor.post_process_object_detection(outputs, threshold=0.9, target_sizes=target_sizes)[
+        ...     0
+        ... ]
+
+        >>> for score, label, box in zip(results["scores"], results["labels"], results["boxes"]):
+        ...     box = [round(i, 2) for i in box.tolist()]
+        ...     print(
+        ...         f"Detected {model.config.id2label[label.item()]} with confidence "
+        ...         f"{round(score.item(), 3)} at location {box}"
+        ...     )
+        Detected sofa with confidence 0.97 at location [0.14, 0.38, 640.13, 476.21]
+        Detected cat with confidence 0.96 at location [343.38, 24.28, 640.14, 371.5]
+        Detected cat with confidence 0.958 at location [13.23, 54.18, 318.98, 472.22]
+        Detected remote with confidence 0.951 at location [40.11, 73.44, 175.96, 118.48]
+        Detected remote with confidence 0.924 at location [333.73, 76.58, 369.97, 186.99]
+        ```"""
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            pixel_values,
+            pixel_mask=pixel_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            labels=labels,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        denoising_meta_values = (
+            outputs.denoising_meta_values if return_dict else outputs[-1] if self.training else None
+        )
+
+        outputs_class = outputs.intermediate_logits if return_dict else outputs[2]
+        outputs_coord = outputs.intermediate_reference_points if return_dict else outputs[3]
+        predicted_corners = outputs.intermediate_predicted_corners if return_dict else outputs[4]
+        initial_reference_points = outputs.initial_reference_points if return_dict else outputs[5]
+
+        logits = outputs_class[:, -1]
+        pred_boxes = outputs_coord[:, -1]
+
+        loss, loss_dict, auxiliary_outputs, enc_topk_logits, enc_topk_bboxes = None, None, None, None, None
+        if labels is not None:
+            enc_topk_logits = outputs.enc_topk_logits if return_dict else outputs[-5]
+            enc_topk_bboxes = outputs.enc_topk_bboxes if return_dict else outputs[-4]
+            loss, loss_dict, auxiliary_outputs = self.loss_function(
+                logits,
+                labels,
+                self.device,
+                pred_boxes,
+                self.config,
+                outputs_class,
+                outputs_coord,
+                enc_topk_logits=enc_topk_logits,
+                enc_topk_bboxes=enc_topk_bboxes,
+                denoising_meta_values=denoising_meta_values,
+                predicted_corners=predicted_corners,
+                initial_reference_points=initial_reference_points,
+                **kwargs,
+            )
+
+        if not return_dict:
+            if auxiliary_outputs is not None:
+                output = (logits, pred_boxes) + (auxiliary_outputs,) + outputs
+            else:
+                output = (logits, pred_boxes) + outputs
+            return ((loss, loss_dict) + output) if loss is not None else output
+
+        return RTDetrObjectDetectionOutput(
+            loss=loss,
+            loss_dict=loss_dict,
+            logits=logits,
+            pred_boxes=pred_boxes,
+            auxiliary_outputs=auxiliary_outputs,
+            last_hidden_state=outputs.last_hidden_state,
+            intermediate_hidden_states=outputs.intermediate_hidden_states,
+            intermediate_logits=outputs.intermediate_logits,
+            intermediate_reference_points=outputs.intermediate_reference_points,
+            intermediate_predicted_corners=outputs.intermediate_predicted_corners,
+            initial_reference_points=outputs.initial_reference_points,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            init_reference_points=outputs.init_reference_points,
+            enc_topk_logits=outputs.enc_topk_logits,
+            enc_topk_bboxes=outputs.enc_topk_bboxes,
+            enc_outputs_class=outputs.enc_outputs_class,
+            enc_outputs_coord_logits=outputs.enc_outputs_coord_logits,
+            denoising_meta_values=outputs.denoising_meta_values,
+        )
+
+
+__all__ = [
+    "RTDetrForObjectDetection",
+    "RTDetrModel",
+    "RTDetrPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py
new file mode 100644
index 0000000000000000000000000000000000000000..21781dc3573fd8a16f44640ea790109aed720389
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr/modeling_rt_detr_resnet.py
@@ -0,0 +1,399 @@
+# coding=utf-8
+# Copyright 2024 Microsoft Research, Inc. and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+PyTorch RTDetr specific ResNet model. The main difference between hugginface ResNet model is that this RTDetrResNet model forces to use shortcut at the first layer in the resnet-18/34 models.
+See https://github.com/lyuwenyu/RT-DETR/blob/5b628eaa0a2fc25bdafec7e6148d5296b144af85/rtdetr_pytorch/src/nn/backbone/presnet.py#L126 for details.
+"""
+
+import math
+from typing import Optional
+
+from torch import Tensor, nn
+
+from ...activations import ACT2FN
+from ...modeling_outputs import BackboneOutput, BaseModelOutputWithNoAttention
+from ...modeling_utils import PreTrainedModel
+from ...utils import auto_docstring, logging
+from ...utils.backbone_utils import BackboneMixin
+from .configuration_rt_detr_resnet import RTDetrResNetConfig
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetConvLayer -> RTDetrResNetConvLayer
+class RTDetrResNetConvLayer(nn.Module):
+    def __init__(
+        self, in_channels: int, out_channels: int, kernel_size: int = 3, stride: int = 1, activation: str = "relu"
+    ):
+        super().__init__()
+        self.convolution = nn.Conv2d(
+            in_channels, out_channels, kernel_size=kernel_size, stride=stride, padding=kernel_size // 2, bias=False
+        )
+        self.normalization = nn.BatchNorm2d(out_channels)
+        self.activation = ACT2FN[activation] if activation is not None else nn.Identity()
+
+    def forward(self, input: Tensor) -> Tensor:
+        hidden_state = self.convolution(input)
+        hidden_state = self.normalization(hidden_state)
+        hidden_state = self.activation(hidden_state)
+        return hidden_state
+
+
+class RTDetrResNetEmbeddings(nn.Module):
+    """
+    ResNet Embeddings (stem) composed of a deep aggressive convolution.
+    """
+
+    def __init__(self, config: RTDetrResNetConfig):
+        super().__init__()
+        self.embedder = nn.Sequential(
+            *[
+                RTDetrResNetConvLayer(
+                    config.num_channels,
+                    config.embedding_size // 2,
+                    kernel_size=3,
+                    stride=2,
+                    activation=config.hidden_act,
+                ),
+                RTDetrResNetConvLayer(
+                    config.embedding_size // 2,
+                    config.embedding_size // 2,
+                    kernel_size=3,
+                    stride=1,
+                    activation=config.hidden_act,
+                ),
+                RTDetrResNetConvLayer(
+                    config.embedding_size // 2,
+                    config.embedding_size,
+                    kernel_size=3,
+                    stride=1,
+                    activation=config.hidden_act,
+                ),
+            ]
+        )
+        self.pooler = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
+        self.num_channels = config.num_channels
+
+    def forward(self, pixel_values: Tensor) -> Tensor:
+        num_channels = pixel_values.shape[1]
+        if num_channels != self.num_channels:
+            raise ValueError(
+                "Make sure that the channel dimension of the pixel values match with the one set in the configuration."
+            )
+        embedding = self.embedder(pixel_values)
+        embedding = self.pooler(embedding)
+        return embedding
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetShortCut -> RTDetrResNetChortCut
+class RTDetrResNetShortCut(nn.Module):
+    """
+    ResNet shortcut, used to project the residual features to the correct size. If needed, it is also used to
+    downsample the input using `stride=2`.
+    """
+
+    def __init__(self, in_channels: int, out_channels: int, stride: int = 2):
+        super().__init__()
+        self.convolution = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=stride, bias=False)
+        self.normalization = nn.BatchNorm2d(out_channels)
+
+    def forward(self, input: Tensor) -> Tensor:
+        hidden_state = self.convolution(input)
+        hidden_state = self.normalization(hidden_state)
+        return hidden_state
+
+
+class RTDetrResNetBasicLayer(nn.Module):
+    """
+    A classic ResNet's residual layer composed by two `3x3` convolutions.
+    See https://github.com/lyuwenyu/RT-DETR/blob/5b628eaa0a2fc25bdafec7e6148d5296b144af85/rtdetr_pytorch/src/nn/backbone/presnet.py#L34.
+    """
+
+    def __init__(
+        self,
+        config: RTDetrResNetConfig,
+        in_channels: int,
+        out_channels: int,
+        stride: int = 1,
+        should_apply_shortcut: bool = False,
+    ):
+        super().__init__()
+        if in_channels != out_channels:
+            self.shortcut = (
+                nn.Sequential(
+                    *[nn.AvgPool2d(2, 2, 0, ceil_mode=True), RTDetrResNetShortCut(in_channels, out_channels, stride=1)]
+                )
+                if should_apply_shortcut
+                else nn.Identity()
+            )
+        else:
+            self.shortcut = (
+                RTDetrResNetShortCut(in_channels, out_channels, stride=stride)
+                if should_apply_shortcut
+                else nn.Identity()
+            )
+        self.layer = nn.Sequential(
+            RTDetrResNetConvLayer(in_channels, out_channels, stride=stride),
+            RTDetrResNetConvLayer(out_channels, out_channels, activation=None),
+        )
+        self.activation = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        residual = hidden_state
+        hidden_state = self.layer(hidden_state)
+        residual = self.shortcut(residual)
+        hidden_state += residual
+        hidden_state = self.activation(hidden_state)
+        return hidden_state
+
+
+class RTDetrResNetBottleNeckLayer(nn.Module):
+    """
+    A classic RTDetrResNet's bottleneck layer composed by three `3x3` convolutions.
+
+    The first `1x1` convolution reduces the input by a factor of `reduction` in order to make the second `3x3`
+    convolution faster. The last `1x1` convolution remaps the reduced features to `out_channels`. If
+    `downsample_in_bottleneck` is true, downsample will be in the first layer instead of the second layer.
+    """
+
+    def __init__(
+        self,
+        config: RTDetrResNetConfig,
+        in_channels: int,
+        out_channels: int,
+        stride: int = 1,
+    ):
+        super().__init__()
+        reduction = 4
+        should_apply_shortcut = in_channels != out_channels or stride != 1
+        reduces_channels = out_channels // reduction
+        if stride == 2:
+            self.shortcut = nn.Sequential(
+                *[
+                    nn.AvgPool2d(2, 2, 0, ceil_mode=True),
+                    RTDetrResNetShortCut(in_channels, out_channels, stride=1)
+                    if should_apply_shortcut
+                    else nn.Identity(),
+                ]
+            )
+        else:
+            self.shortcut = (
+                RTDetrResNetShortCut(in_channels, out_channels, stride=stride)
+                if should_apply_shortcut
+                else nn.Identity()
+            )
+        self.layer = nn.Sequential(
+            RTDetrResNetConvLayer(
+                in_channels, reduces_channels, kernel_size=1, stride=stride if config.downsample_in_bottleneck else 1
+            ),
+            RTDetrResNetConvLayer(
+                reduces_channels, reduces_channels, stride=stride if not config.downsample_in_bottleneck else 1
+            ),
+            RTDetrResNetConvLayer(reduces_channels, out_channels, kernel_size=1, activation=None),
+        )
+        self.activation = ACT2FN[config.hidden_act]
+
+    def forward(self, hidden_state):
+        residual = hidden_state
+        hidden_state = self.layer(hidden_state)
+        residual = self.shortcut(residual)
+        hidden_state += residual
+        hidden_state = self.activation(hidden_state)
+        return hidden_state
+
+
+class RTDetrResNetStage(nn.Module):
+    """
+    A RTDetrResNet stage composed by stacked layers.
+    """
+
+    def __init__(
+        self,
+        config: RTDetrResNetConfig,
+        in_channels: int,
+        out_channels: int,
+        stride: int = 2,
+        depth: int = 2,
+    ):
+        super().__init__()
+
+        layer = RTDetrResNetBottleNeckLayer if config.layer_type == "bottleneck" else RTDetrResNetBasicLayer
+
+        if config.layer_type == "bottleneck":
+            first_layer = layer(
+                config,
+                in_channels,
+                out_channels,
+                stride=stride,
+            )
+        else:
+            first_layer = layer(config, in_channels, out_channels, stride=stride, should_apply_shortcut=True)
+        self.layers = nn.Sequential(
+            first_layer, *[layer(config, out_channels, out_channels) for _ in range(depth - 1)]
+        )
+
+    def forward(self, input: Tensor) -> Tensor:
+        hidden_state = input
+        for layer in self.layers:
+            hidden_state = layer(hidden_state)
+        return hidden_state
+
+
+# Copied from transformers.models.resnet.modeling_resnet.ResNetEncoder with ResNet->RTDetrResNet
+class RTDetrResNetEncoder(nn.Module):
+    def __init__(self, config: RTDetrResNetConfig):
+        super().__init__()
+        self.stages = nn.ModuleList([])
+        # based on `downsample_in_first_stage` the first layer of the first stage may or may not downsample the input
+        self.stages.append(
+            RTDetrResNetStage(
+                config,
+                config.embedding_size,
+                config.hidden_sizes[0],
+                stride=2 if config.downsample_in_first_stage else 1,
+                depth=config.depths[0],
+            )
+        )
+        in_out_channels = zip(config.hidden_sizes, config.hidden_sizes[1:])
+        for (in_channels, out_channels), depth in zip(in_out_channels, config.depths[1:]):
+            self.stages.append(RTDetrResNetStage(config, in_channels, out_channels, depth=depth))
+
+    def forward(
+        self, hidden_state: Tensor, output_hidden_states: bool = False, return_dict: bool = True
+    ) -> BaseModelOutputWithNoAttention:
+        hidden_states = () if output_hidden_states else None
+
+        for stage_module in self.stages:
+            if output_hidden_states:
+                hidden_states = hidden_states + (hidden_state,)
+
+            hidden_state = stage_module(hidden_state)
+
+        if output_hidden_states:
+            hidden_states = hidden_states + (hidden_state,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_state, hidden_states] if v is not None)
+
+        return BaseModelOutputWithNoAttention(
+            last_hidden_state=hidden_state,
+            hidden_states=hidden_states,
+        )
+
+
+@auto_docstring
+# Copied from transformers.models.resnet.modeling_resnet.ResNetPreTrainedModel with ResNet->RTDetrResNet
+class RTDetrResNetPreTrainedModel(PreTrainedModel):
+    config: RTDetrResNetConfig
+    base_model_prefix = "resnet"
+    main_input_name = "pixel_values"
+    _no_split_modules = ["RTDetrResNetConvLayer", "RTDetrResNetShortCut"]
+
+    def _init_weights(self, module):
+        if isinstance(module, nn.Conv2d):
+            nn.init.kaiming_normal_(module.weight, mode="fan_out", nonlinearity="relu")
+        # copied from the `reset_parameters` method of `class Linear(Module)` in `torch`.
+        elif isinstance(module, nn.Linear):
+            nn.init.kaiming_uniform_(module.weight, a=math.sqrt(5))
+            if module.bias is not None:
+                fan_in, _ = nn.init._calculate_fan_in_and_fan_out(module.weight)
+                bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
+                nn.init.uniform_(module.bias, -bound, bound)
+        elif isinstance(module, (nn.BatchNorm2d, nn.GroupNorm)):
+            nn.init.constant_(module.weight, 1)
+            nn.init.constant_(module.bias, 0)
+
+
+@auto_docstring(
+    custom_intro="""
+    ResNet backbone, to be used with frameworks like RTDETR.
+    """
+)
+class RTDetrResNetBackbone(RTDetrResNetPreTrainedModel, BackboneMixin):
+    has_attentions = False
+
+    def __init__(self, config):
+        super().__init__(config)
+        super()._init_backbone(config)
+
+        self.num_features = [config.embedding_size] + config.hidden_sizes
+        self.embedder = RTDetrResNetEmbeddings(config)
+        self.encoder = RTDetrResNetEncoder(config)
+
+        # initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self, pixel_values: Tensor, output_hidden_states: Optional[bool] = None, return_dict: Optional[bool] = None
+    ) -> BackboneOutput:
+        r"""
+                        Examples:
+
+                        ```python
+                        >>> from transformers import RTDetrResNetConfig, RTDetrResNetBackbone
+                        >>> import torch
+        from ...utils.deprecation import deprecate_kwarg
+        from ...utils.deprecation import deprecate_kwarg
+        from ...utils.deprecation import deprecate_kwarg
+                from ...utils.deprecation import deprecate_kwarg
+                from ...utils.deprecation import deprecate_kwarg
+
+                        >>> config = RTDetrResNetConfig()
+                        >>> model = RTDetrResNetBackbone(config)
+
+                        >>> pixel_values = torch.randn(1, 3, 224, 224)
+
+                        >>> with torch.no_grad():
+                        ...     outputs = model(pixel_values)
+
+                        >>> feature_maps = outputs.feature_maps
+                        >>> list(feature_maps[-1].shape)
+                        [1, 2048, 7, 7]
+                        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        embedding_output = self.embedder(pixel_values)
+
+        outputs = self.encoder(embedding_output, output_hidden_states=True, return_dict=True)
+
+        hidden_states = outputs.hidden_states
+
+        feature_maps = ()
+        for idx, stage in enumerate(self.stage_names):
+            if stage in self.out_features:
+                feature_maps += (hidden_states[idx],)
+
+        if not return_dict:
+            output = (feature_maps,)
+            if output_hidden_states:
+                output += (outputs.hidden_states,)
+            return output
+
+        return BackboneOutput(
+            feature_maps=feature_maps,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=None,
+        )
+
+
+__all__ = [
+    "RTDetrResNetBackbone",
+    "RTDetrResNetPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..efc3772ca6d015271102638c3948958777498fd9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__init__.py
@@ -0,0 +1,29 @@
+# Copyright 2025 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_rt_detr_v2 import *
+    from .modeling_rt_detr_v2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f4d2b9898eb232fae0f84109a6c4a6bff8f09e07
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/configuration_rt_detr_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/configuration_rt_detr_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6d1a6de2b4835854cf0937b07e79bffd65d6a6dc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/configuration_rt_detr_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/modular_rt_detr_v2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/modular_rt_detr_v2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c3bdfaa2198b83b34e5cd35ea80d2eac653d26c0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rt_detr_v2/__pycache__/modular_rt_detr_v2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..86c8cc7df11e5a343a21a477a4e2a9ce68e550aa
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/configuration_rwkv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/configuration_rwkv.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..92f7f260b446734e89ef2b8a19b67847f52af55d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/configuration_rwkv.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/modeling_rwkv.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/modeling_rwkv.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8690e0c7bbf4635a652cfc02f78f2cfd6fd060f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/rwkv/__pycache__/modeling_rwkv.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..32cbe1887c9f8b5aad56f1c6a8d4c965c94ad9d5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..66d690a1589905f3f7081450ae779bc0f4ed8858
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modeling_sam_hq.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..25eba43567fda06ab476bc28de6ac5d79118750b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/modular_sam_hq.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7d63f7e32f525a5bec7699cb0b7a96daa84b8bf1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/sam_hq/__pycache__/processing_samhq.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0d289de0474acfadeeee08019dd57f40f9a1a4b4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/__init__.py
@@ -0,0 +1,31 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_seamless_m4t import *
+    from .feature_extraction_seamless_m4t import *
+    from .modeling_seamless_m4t import *
+    from .processing_seamless_m4t import *
+    from .tokenization_seamless_m4t import *
+    from .tokenization_seamless_m4t_fast import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/configuration_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/configuration_seamless_m4t.py
new file mode 100644
index 0000000000000000000000000000000000000000..fe4f911957fcb83f91113da96aeeb9280a0638fd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/configuration_seamless_m4t.py
@@ -0,0 +1,416 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SeamlessM4T model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SeamlessM4TConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~SeamlessM4TModel`]. It is used to instantiate an
+    SeamlessM4T model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the SeamlessM4T
+    ["facebook/hf-seamless-m4t-medium"](https://huggingface.co/"facebook/hf-seamless-m4t-medium") architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256102):
+            Vocabulary size of the SeamlessM4T model. Defines the number of different tokens that can be represented by
+            the `inputs_ids` passed when calling [`~SeamlessM4TModel`], [`~SeamlessM4TForTextToSpeech`] or
+            [`~SeamlessM4TForTextToText`].
+        t2u_vocab_size (`int`, *optional*, defaults to 10082):
+            Unit vocabulary size of the SeamlessM4T model. Defines the number of different unit tokens that can be
+            represented by the `inputs_ids` passed when calling the Text-To-Units sub-model of [`~SeamlessM4TModel`],
+            [`~SeamlessM4TForSpeechToSpeech`] or [`~SeamlessM4TForTextToSpeech`].
+
+        > Parameters shared across sub-models
+
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the "intermediate" layers in the architecture.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        max_position_embeddings (`int`, *optional*, defaults to 1024):
+            The maximum sequence length that this model text encoder and decoder might ever be used with. Typically set
+            this to something large just in case (e.g., 512 or 1024 or 2048).
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as an encoder/decoder or not.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the encoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the decoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the decoder and feed-forward layers. If string,
+            `"gelu"`, `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, decoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all attention layers.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all activation layers in the model.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Scale embeddings by diving by sqrt(d_model).
+
+        > Text encoder and text decoder specific parameters
+
+        encoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer text encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text encoder.
+        decoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer text decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text decoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text decoder.
+        decoder_start_token_id (`int`, *optional*, defaults to 3):
+            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. Only
+            applied in the text decoder.
+        max_new_tokens (`int`, *optional*, defaults to 256):
+            The maximum numbers of text tokens to generate, ignoring the number of tokens in the prompt.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the _padding_ text token. Only applied to the text-decoder model.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            The id of the _beginning-of-stream_ text token. Only applied to the text-decoder model.
+        eos_token_id (`int`, *optional*, defaults to 3):
+            The id of the _end-of-stream_ text token. Only applied to the text-decoder model.
+
+        > Speech encoder specific parameters
+
+        speech_encoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer speech encoder.
+        speech_encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer speech encoder.
+        speech_encoder_intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer speech encoder.
+        speech_encoder_hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
+            The non-linear activation function (function or string) in the speech encoder. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        speech_encoder_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all layers in the speech encoder.
+        add_adapter (`bool`, *optional*, defaults to `True`):
+            Add an adapter layer on top of the speech encoder.
+        speech_encoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability for the speech encoder. See the [LayerDrop paper](see
+            https://huggingface.co/papers/1909.11556) for more details.
+        feature_projection_input_dim (`int`, *optional*, defaults to 160):
+            Input dimension of the input feature projection of the speech encoder, i.e the dimension after processing
+            input audios with [`SeamlessM4TFeatureExtractor`].
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+            embeddings layer of the speech encoder.
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+            Number of groups of 1D convolutional positional embeddings layer of the speech encoder.
+        adaptor_kernel_size (`int`, *optional*, defaults to 8):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adaptor_stride (`int`, *optional*, defaults to 8):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adaptor_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all layers in the speech adapter.
+        num_adapter_layers (`int`, *optional*, defaults to 1):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        position_embeddings_type (`str`, *optional*, defaults to `"relative"`):
+            Can be specified to `relative` or `rotary` for relative or rotary position embeddings respectively. If left
+            `None` no relative position embedding is applied. Only applied to the speech encoder.
+        rotary_embedding_base (`int`, *optional*, defaults to 10000):
+            If `"rotary"` position embeddings are used, defines the size of the embedding base. Only applied to the
+            speech encoder.
+        max_source_positions (`int`, *optional*, defaults to 4096):
+            if `"relative"` position embeddings are used, defines the maximum source input positions. Only applied to
+            the speech encoder.
+        conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
+            Kernel size of convolutional depthwise 1D layer in Conformer blocks. Only applied to the speech encoder.
+
+        > Text-To-Unit (t2u) model specific parameters
+
+        t2u_bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the _beginning-of-stream_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_pad_token_id (`int`, *optional*, defaults to 1):
+            The id of the _padding_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the _end-of-stream_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_decoder_start_token_id (`int`, *optional*, defaults to 2):
+            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. Only
+            applied to the text-to-unit seq2seq model.
+        t2u_max_new_tokens (`int`, *optional*, defaults to 1024):
+            The maximum numbers of unit tokens to generate, ignoring the number of tokens in the prompt. Only applied
+            to the text-to-unit seq2seq model.
+        t2u_encoder_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer text-to-unit encoder.
+        t2u_encoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit encoder.
+        t2u_encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text-to-unit encoder.
+        t2u_decoder_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer text-to-unit decoder.
+        t2u_decoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit decoder.
+        t2u_decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text-to-unit decoder.
+        t2u_max_position_embeddings (`int`, *optional*, defaults to 2048):
+            The maximum sequence length that this model text-to-unit component might ever be used with. Typically set
+            this to something large just in case (e.g., 512 or 1024 or 2048).
+
+         > Hifi-Gan Vocoder specific parameters
+
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the output audio will be generated, expressed in hertz (Hz).
+        upsample_initial_channel (`int`, *optional*, defaults to 512):
+            The number of input channels into the hifi-gan upsampling network. Applies to the vocoder only.
+        upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[5, 4, 4, 2, 2]`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the vocoder upsampling network.
+            The length of *upsample_rates* defines the number of convolutional layers and has to match the length of
+            *upsample_kernel_sizes*. Applies to the vocoder only.
+        upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[11, 8, 8, 4, 4]`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the vocoder upsampling
+            network. The length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match
+            the length of *upsample_rates*. Applies to the vocoder only.
+        resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`):
+            A tuple of integers defining the kernel sizes of the vocoder 1D convolutional layers in the multi-receptive
+            field fusion (MRF) module. Applies to the vocoder only.
+        resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
+            A nested tuple of integers defining the dilation rates of the vocoder dilated 1D convolutional layers in
+            the multi-receptive field fusion (MRF) module. Applies to the vocoder only.
+        leaky_relu_slope (`float`, *optional*, defaults to 0.1):
+            The angle of the negative slope used by the leaky ReLU activation in the vocoder. Applies to the vocoder
+            only.
+        unit_hifi_gan_vocab_size (`int`, *optional*, defaults to 10000):
+            Vocabulary size of the SeamlessM4T vocoder. Defines the number of different unit tokens that can be
+            represented by the `inputs_ids` passed when calling the vocoder of [`~SeamlessM4TModel`],
+            [`~SeamlessM4TForSpeechToSpeech`] or [`~SeamlessM4TForTextToSpeech`].
+        unit_embed_dim (`int`, *optional*, defaults to 1280):
+            The projection dimension of the input ids given to the hifi-gan vocoder. Applies to the vocoder only.
+        lang_embed_dim (`int`, *optional*, defaults to 256):
+            The projection dimension of the target language given to the hifi-gan vocoder. Applies to the vocoder only.
+        spkr_embed_dim (`int`, *optional*, defaults to 256):
+            The projection dimension of the speaker id given to the hifi-gan vocoder. Applies to the vocoder only.
+        vocoder_num_langs (`int`, *optional*, defaults to 36):
+            Number of langs supported by the vocoder. Might be different from `t2u_num_langs`.
+        vocoder_num_spkrs (`int`, *optional*, defaults to 200):
+            Number of speakers supported by the vocoder.
+        variance_predictor_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the duration predictor. Applies to the vocoder only.
+        var_pred_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout probability of the duration predictor. Applies to the vocoder only.
+        vocoder_offset (`int`, *optional*, defaults to 4):
+            Offset the unit token ids by this number to account for symbol tokens. Applies to the vocoder only.
+
+    ```python
+    >>> from transformers import SeamlessM4TModel, SeamlessM4TConfig
+
+    >>> # Initializing a SeamlessM4T "facebook/hf-seamless-m4t-medium" style configuration
+    >>> configuration = SeamlessM4TConfig()
+
+    >>> # Initializing a model from the "facebook/hf-seamless-m4t-medium" style configuration
+    >>> model = SeamlessM4TModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "seamless_m4t"
+
+    def __init__(
+        self,
+        vocab_size=256102,
+        t2u_vocab_size=10082,
+        # shared config
+        hidden_size=1024,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        max_position_embeddings=1024,
+        is_encoder_decoder=True,
+        encoder_layerdrop=0.05,
+        decoder_layerdrop=0.05,
+        activation_function="relu",
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        scale_embedding=True,
+        # text encoder|decoder
+        encoder_layers=24,
+        encoder_ffn_dim=8192,
+        encoder_attention_heads=16,
+        decoder_layers=24,
+        decoder_ffn_dim=8192,
+        decoder_attention_heads=16,
+        decoder_start_token_id=3,
+        max_new_tokens=256,
+        pad_token_id=0,
+        bos_token_id=2,
+        eos_token_id=3,
+        # speech_encoder
+        speech_encoder_layers=24,
+        speech_encoder_attention_heads=16,
+        speech_encoder_intermediate_size=4096,
+        speech_encoder_hidden_act="swish",
+        speech_encoder_dropout=0.0,
+        add_adapter=True,
+        speech_encoder_layerdrop=0.1,
+        feature_projection_input_dim=160,
+        num_conv_pos_embeddings=128,
+        num_conv_pos_embedding_groups=16,
+        adaptor_kernel_size=8,
+        adaptor_stride=8,
+        adaptor_dropout=0.1,
+        num_adapter_layers=1,
+        position_embeddings_type="relative",
+        rotary_embedding_base=10000,
+        max_source_positions=4096,
+        conv_depthwise_kernel_size=31,
+        # t2u config
+        t2u_bos_token_id=0,
+        t2u_pad_token_id=1,
+        t2u_eos_token_id=2,
+        t2u_decoder_start_token_id=2,
+        t2u_max_new_tokens=1024,
+        t2u_encoder_layers=6,
+        t2u_encoder_ffn_dim=8192,
+        t2u_encoder_attention_heads=16,
+        t2u_decoder_layers=6,
+        t2u_decoder_ffn_dim=8192,
+        t2u_decoder_attention_heads=16,
+        t2u_max_position_embeddings=2048,
+        # hifi-gan vocoder config
+        sampling_rate=16000,
+        upsample_initial_channel=512,
+        upsample_rates=[5, 4, 4, 2, 2],
+        upsample_kernel_sizes=[11, 8, 8, 4, 4],
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        leaky_relu_slope=0.1,
+        # specific to Code Hifi-Gan
+        unit_hifi_gan_vocab_size=10000,
+        unit_embed_dim=1280,
+        lang_embed_dim=256,
+        spkr_embed_dim=256,
+        vocoder_num_langs=36,
+        vocoder_num_spkrs=200,
+        variance_predictor_kernel_size=3,
+        var_pred_dropout=0.5,
+        vocoder_offset=4,
+        **kwargs,
+    ):
+        # overall_config
+        self.vocab_size = vocab_size
+        self.t2u_vocab_size = t2u_vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.max_new_tokens = max_new_tokens
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.activation_function = activation_function
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.scale_embedding = scale_embedding
+        # for proper config init
+        self.num_attention_heads = decoder_attention_heads
+        self.num_hidden_layers = decoder_layers
+
+        # text|unit encoder|decoder
+        self.encoder_layers = encoder_layers
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_layers = decoder_layers
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_attention_heads = decoder_attention_heads
+
+        # speech_encoder
+        self.speech_encoder_layers = speech_encoder_layers
+        self.speech_encoder_hidden_act = speech_encoder_hidden_act
+        self.speech_encoder_dropout = speech_encoder_dropout
+        self.speech_encoder_attention_heads = speech_encoder_attention_heads
+        self.speech_encoder_layerdrop = speech_encoder_layerdrop
+        self.speech_encoder_intermediate_size = speech_encoder_intermediate_size
+        self.feature_projection_input_dim = feature_projection_input_dim
+        self.num_conv_pos_embeddings = num_conv_pos_embeddings
+        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+        self.adaptor_kernel_size = adaptor_kernel_size
+        self.adaptor_stride = adaptor_stride
+        self.adaptor_dropout = adaptor_dropout
+        self.num_adapter_layers = num_adapter_layers
+        self.position_embeddings_type = position_embeddings_type
+        self.rotary_embedding_base = rotary_embedding_base
+        self.max_source_positions = max_source_positions
+        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
+        self.add_adapter = add_adapter
+
+        # t2u config
+        self.t2u_bos_token_id = t2u_bos_token_id
+        self.t2u_pad_token_id = t2u_pad_token_id
+        self.t2u_eos_token_id = t2u_eos_token_id
+        self.t2u_decoder_start_token_id = t2u_decoder_start_token_id
+        self.t2u_max_new_tokens = t2u_max_new_tokens
+        self.t2u_encoder_layers = t2u_encoder_layers
+        self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim
+        self.t2u_encoder_attention_heads = t2u_encoder_attention_heads
+        self.t2u_decoder_layers = t2u_decoder_layers
+        self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim
+        self.t2u_decoder_attention_heads = t2u_decoder_attention_heads
+        self.t2u_max_position_embeddings = t2u_max_position_embeddings
+
+        # hifi-gan vocoder config
+        # original parameters specific to Hifi-Gan
+        self.sampling_rate = sampling_rate
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.leaky_relu_slope = leaky_relu_slope
+
+        # specific to Code Hifi-Gan
+        self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size
+        self.unit_embed_dim = unit_embed_dim
+        self.lang_embed_dim = lang_embed_dim
+        self.spkr_embed_dim = spkr_embed_dim
+        self.vocoder_num_langs = vocoder_num_langs
+        self.vocoder_num_spkrs = vocoder_num_spkrs
+        self.variance_predictor_kernel_size = variance_predictor_kernel_size
+        self.var_pred_dropout = var_pred_dropout
+        self.vocoder_offset = vocoder_offset
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            max_position_embeddings=max_position_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["SeamlessM4TConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py
new file mode 100644
index 0000000000000000000000000000000000000000..59a20059f869ecca947d0999d3239573fbf10194
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/feature_extraction_seamless_m4t.py
@@ -0,0 +1,309 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Feature extractor class for SeamlessM4T
+"""
+
+from typing import Optional, Union
+
+import numpy as np
+
+from ...utils import is_torch_available
+
+
+if is_torch_available():
+    import torch
+
+from ...audio_utils import mel_filter_bank, spectrogram, window_function
+from ...feature_extraction_sequence_utils import SequenceFeatureExtractor
+from ...feature_extraction_utils import BatchFeature
+from ...utils import PaddingStrategy, TensorType, logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SeamlessM4TFeatureExtractor(SequenceFeatureExtractor):
+    r"""
+    Constructs a SeamlessM4T feature extractor.
+
+    This feature extractor inherits from [`SequenceFeatureExtractor`] which contains most of the main methods. Users
+    should refer to this superclass for more information regarding those methods.
+
+    This class extracts mel-filter bank features from raw speech.
+
+    Args:
+        feature_size (`int`, *optional*, defaults to 80):
+            The feature dimension of the extracted features.
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the audio files should be digitalized expressed in hertz (Hz).
+        num_mel_bins (`int`, *optional*, defaults to 80):
+            Number of Mel-frequency bins.
+        padding_value (`float`, *optional*, defaults to 0.0):
+            The value that is used to fill the padding vectors.
+        stride (`int`, *optional*, defaults to 2):
+            Stride used to reshape audios from shape (batch_size,num_frames,num_mel_bins) to
+            (batch_size,num_frames//stride,num_mel_bins*stride).
+    """
+
+    model_input_names = ["input_features", "attention_mask"]
+
+    def __init__(
+        self,
+        feature_size=80,
+        sampling_rate=16000,
+        num_mel_bins=80,
+        padding_value=0.0,
+        stride=2,
+        **kwargs,
+    ):
+        self.num_mel_bins = num_mel_bins
+        self.return_attention_mask = True
+        self.stride = stride
+
+        mel_filters = mel_filter_bank(
+            num_frequency_bins=257,
+            num_mel_filters=self.num_mel_bins,
+            min_frequency=20,
+            max_frequency=sampling_rate // 2,
+            sampling_rate=sampling_rate,
+            norm=None,
+            mel_scale="kaldi",
+            triangularize_in_mel_space=True,
+        )
+
+        self.mel_filters = mel_filters
+        self.window = window_function(400, "povey", periodic=False)
+
+        super().__init__(feature_size=feature_size, sampling_rate=sampling_rate, padding_value=padding_value, **kwargs)
+
+    @staticmethod
+    # Copied from transformers.models.wav2vec2.feature_extraction_wav2vec2.Wav2Vec2FeatureExtractor.zero_mean_unit_var_norm
+    def zero_mean_unit_var_norm(
+        input_values: list[np.ndarray], attention_mask: list[np.ndarray], padding_value: float = 0.0
+    ) -> list[np.ndarray]:
+        """
+        Every array in the list is normalized to have zero mean and unit variance
+        """
+        if attention_mask is not None:
+            attention_mask = np.array(attention_mask, np.int32)
+            normed_input_values = []
+
+            for vector, length in zip(input_values, attention_mask.sum(-1)):
+                normed_slice = (vector - vector[:length].mean()) / np.sqrt(vector[:length].var() + 1e-7)
+                if length < normed_slice.shape[0]:
+                    normed_slice[length:] = padding_value
+
+                normed_input_values.append(normed_slice)
+        else:
+            normed_input_values = [(x - x.mean()) / np.sqrt(x.var() + 1e-7) for x in input_values]
+
+        return normed_input_values
+
+    def _extract_fbank_features(
+        self,
+        waveform: np.ndarray,
+    ) -> np.ndarray:
+        """
+        Get mel-filter bank features using TorchAudio. Note that TorchAudio requires 16-bit signed integers as inputs
+        and hence the waveform should not be normalized before feature extraction.
+        """
+        # by default, it extracts the left channel if stereo
+        if len(waveform.shape) == 2:
+            waveform = waveform[0]
+
+        waveform = np.squeeze(waveform) * (2**15)  # Kaldi compliance: 16-bit signed integers
+        features = spectrogram(
+            waveform,
+            self.window,
+            frame_length=400,
+            hop_length=160,
+            fft_length=512,
+            power=2.0,
+            center=False,
+            preemphasis=0.97,
+            mel_filters=self.mel_filters,
+            log_mel="log",
+            mel_floor=1.192092955078125e-07,
+            remove_dc_offset=True,
+        ).T
+        return features
+
+    def __call__(
+        self,
+        raw_speech: Union[np.ndarray, list[float], list[np.ndarray], list[list[float]]],
+        padding: Union[bool, str, PaddingStrategy] = True,
+        pad_to_multiple_of: Optional[int] = 2,
+        max_length: Optional[int] = None,
+        truncation: bool = False,
+        return_tensors: Optional[Union[str, TensorType]] = None,
+        sampling_rate: Optional[int] = None,
+        return_attention_mask: Optional[bool] = None,
+        do_normalize_per_mel_bins: Optional[bool] = True,
+        **kwargs,
+    ) -> BatchFeature:
+        """
+        Main method to featurize and prepare for the model one or several sequence(s).
+
+        Args:
+            raw_speech (`np.ndarray`, `torch.Tensor`, `list[float]`, `list[np.ndarray]`, `list[torch.Tensor]`,
+            `list[list[float]]`, `list[list[list[float]]]`):
+                The sequence or batch of sequences to be padded. Each sequence can be a numpy array,
+                a torch tensor, a list of float values, a list of numpy arrays, a list of torch tensors,
+                a list of list of float values or a list of a list of list of float values.
+                If `raw_speech` is a one-dimensional `np.ndarray`, `torch.Tensor` or a `list[float]`, `raw_speech` is
+                considered a single-channel, single-sample sound. In all other cases, the first dimension of
+                `raw_speech`, whether from an `np.ndarray`, a `torch.Tensor` or a `list[...]`,
+                corresponds to the number of samples in the batch, and the number of channels
+                (i.e. mono or stereo character) is derived from the other dimensions
+                (1D -> single-channel waveform batches; 2D-> stereo-channel waveform batches).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            pad_to_multiple_of (`int`, *optional*, defaults to 2):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta), or on TPUs which benefit from having sequence lengths be a multiple of 128.
+            max_length (`int`, *optional*):
+                Maximum length of the returned list and optionally padding length (see above).
+            truncation (`bool`):
+                Activates truncation to cut input sequences longer than *max_length* to *max_length*.
+            return_attention_mask (`bool`, *optional*):
+                Whether to return the attention mask. If left to the default, will return the attention mask according
+                to the specific feature_extractor's default.
+
+                [What are attention masks?](../glossary#attention-mask)
+
+                <Tip>
+
+                For SeamlessM4T models, `attention_mask` should always be passed for batched inference, to avoid subtle
+                bugs.
+
+                </Tip>
+
+            return_tensors (`str` or [`~utils.TensorType`], *optional*):
+                If set, will return tensors instead of list of python integers. Acceptable values are:
+
+                - `'tf'`: Return TensorFlow `tf.constant` objects.
+                - `'pt'`: Return PyTorch `torch.Tensor` objects.
+                - `'np'`: Return Numpy `np.ndarray` objects.
+            sampling_rate (`int`, *optional*):
+                The sampling rate at which the `raw_speech` input was sampled. It is strongly recommended to pass
+                `sampling_rate` at the forward call to prevent silent errors.
+            do_normalize_per_mel_bins (`bool`, *optional*, defaults to `True`):
+                Whether or not to zero-mean unit-variance normalize the input per mel-channel.
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to the tokenizer or the feature
+                extractor.
+        """
+        if sampling_rate is not None:
+            if sampling_rate != self.sampling_rate:
+                raise ValueError(
+                    f"The model corresponding to this feature extractor: {self} was trained using a sampling rate of"
+                    f" {self.sampling_rate}. Please make sure that the provided `raw_speech` input was sampled with"
+                    f" {self.sampling_rate} and not {sampling_rate}."
+                )
+        else:
+            logger.warning(
+                f"It is strongly recommended to pass the `sampling_rate` argument to `{self.__class__.__name__}()`. "
+                "Failing to do so can result in silent errors that might be hard to debug."
+            )
+
+        return_attention_mask = (
+            return_attention_mask if return_attention_mask is not None else self.return_attention_mask
+        )
+
+        is_batched_numpy = isinstance(raw_speech, np.ndarray) and len(raw_speech.shape) > 1
+        if is_batched_numpy and len(raw_speech.shape) > 3:
+            raise ValueError(f"Only mono-channel or stereo-channel audio is supported for input to {self}")
+
+        acceptable_types = (
+            (torch.Tensor, np.ndarray, tuple, list) if is_torch_available() else (np.ndarray, tuple, list)
+        )
+        is_batched = is_batched_numpy or (
+            isinstance(raw_speech, (list, tuple)) and (isinstance(raw_speech[0], acceptable_types))
+        )
+
+        if is_batched:
+            raw_speech = [np.asarray(speech, dtype=np.float32) for speech in raw_speech]
+        elif not is_batched and not isinstance(raw_speech, np.ndarray):
+            raw_speech = np.asarray(raw_speech, dtype=np.float32)
+        elif isinstance(raw_speech, np.ndarray) and raw_speech.dtype is np.dtype(np.float64):
+            raw_speech = raw_speech.astype(np.float32)
+
+        # always return batch
+        if not is_batched:
+            raw_speech = [raw_speech]
+
+        # extract fbank features
+        features = [self._extract_fbank_features(waveform) for waveform in raw_speech]
+
+        if do_normalize_per_mel_bins:
+            # torch defaults to ddof=1, and numpy defaults to ddof=0
+            features = [
+                (x - np.expand_dims(x.mean(0), 0)) / np.sqrt(np.expand_dims(x.var(0, ddof=1), 0) + 1e-7)
+                for x in features
+            ]
+
+        # convert into correct format for padding
+        encoded_inputs = BatchFeature({"input_features": features})
+
+        padded_inputs = self.pad(
+            encoded_inputs,
+            padding=padding,
+            max_length=max_length,
+            truncation=truncation,
+            pad_to_multiple_of=pad_to_multiple_of,
+            return_attention_mask=True,
+            return_tensors="np",
+        )
+
+        # SeamlessM4T needs to process extracted features
+        input_features = padded_inputs.get("input_features")
+        attention_mask = padded_inputs.pop("attention_mask")
+
+        batch_size, num_frames, num_channels = input_features.shape
+
+        remainder = num_frames % self.stride
+        if remainder != 0:
+            input_features = input_features[:, : num_frames - remainder, :]
+            attention_mask = attention_mask[:, : num_frames - remainder]
+
+        input_features = np.reshape(
+            input_features, (batch_size, num_frames // self.stride, num_channels * self.stride)
+        )
+
+        indices = np.arange(0, num_frames - remainder)
+        attention_mask = attention_mask[:, indices % self.stride == 1]
+
+        padded_inputs["input_features"] = input_features
+        if return_attention_mask:
+            padded_inputs["attention_mask"] = attention_mask
+
+        if return_tensors is not None:
+            padded_inputs = padded_inputs.convert_to_tensors(return_tensors)
+
+        return padded_inputs
+
+
+__all__ = ["SeamlessM4TFeatureExtractor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/modeling_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/modeling_seamless_m4t.py
new file mode 100644
index 0000000000000000000000000000000000000000..fbe1d2fb714dae73684d3a4a5f77395ec1837144
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/modeling_seamless_m4t.py
@@ -0,0 +1,4070 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch SeamlessM4T model."""
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import (
+    _prepare_4d_attention_mask,
+    _prepare_4d_causal_attention_mask,
+)
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+    Wav2Vec2BaseModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_seamless_m4t import SeamlessM4TConfig
+
+
+logger = logging.get_logger(__name__)
+
+SEAMLESS_M4T_COMMON_CUSTOM_ARGS = r"""
+    input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+        Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+        [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+    decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+        Indices of decoder input sequence tokens in the vocabulary.
+
+        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+        [`PreTrainedTokenizer.__call__`] for details.
+
+        [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+        Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+        is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+
+        For translation and summarization training, `decoder_input_ids` should be provided. If no
+        `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
+        for denoising pre-training following the paper.
+    decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+        Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+        be used by default.
+
+        If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
+        and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+        information on the default strategy.
+    inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*):
+        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+        is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+        model's internal embedding lookup matrix.
+"""
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class defining the generated outputs from [`SeamlessM4TModel`], [`SeamlessM4TForTextToText`],
+    [`SeamlessM4TForTextToSpeech`], [`SeamlessM4TForSpeechToSpeech`] and [`SeamlessM4TForTextToSpeech`].
+    """
+)
+class SeamlessM4TGenerationOutput(ModelOutput):
+    r"""
+    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+        The final audio waveform predicted by the model.
+    waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
+        The length in samples of each element in the `waveform` batch.
+    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
+        The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
+        early due to the `eos_token_id`.
+    unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
+        The generated translated unit sequences. This is the output of the text-to-units model. The second
+        dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
+        early due to the `t2u_eos_token_id`.
+    """
+
+    waveform: Optional[torch.FloatTensor] = None
+    waveform_lengths: Optional[torch.IntTensor] = None
+    sequences: Optional[tuple[torch.FloatTensor]] = None
+    unit_sequences: Optional[tuple[torch.FloatTensor]] = None
+
+
+############ UTILS ################
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        x: torch.Tensor x:
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
+    """
+    Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that
+    stops at the corresponding element in `seq_lens`.
+
+    Args:
+        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
+            The sequences to mask, where `*` is any number of sequence-specific dimensions including none.
+        seq_lens (`torch.Tensor` of shape `(batch)`:
+            Each element represents the length of the sequence at the same index in `hidden_states`
+
+    Returns:
+        `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`
+    """
+    batch_size, mask_seq_len = hidden_states.shape[:2]
+
+    indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
+
+    bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
+
+    mask = hidden_states.new_ones((batch_size, mask_seq_len))
+
+    mask = mask.masked_fill(bool_mask, 0)
+
+    return mask
+
+
+def format_speech_generation_kwargs(kwargs):
+    """
+    Format kwargs for SeamlessM4T models that generate speech, attribute kwargs to either the text generation or the
+    speech generation models.
+
+    Args:
+        kwargs (`dict`)`:
+             Keyword arguments are of two types:
+
+                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                except for `decoder_input_ids` which will only be passed through the text components.
+                - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                This means you can, for example, specify a generation strategy for one generation but not for the
+                other.
+    """
+    # attribute kwargs to models
+    kwargs_text = {}
+    kwargs_speech = {}
+    for key, value in kwargs.items():
+        if key.startswith("text_"):
+            key = key[len("text_") :]
+            kwargs_text[key] = value
+        elif key.startswith("speech_"):
+            key = key[len("speech_") :]
+            kwargs_speech[key] = value
+        elif key == "generation_config":
+            kwargs_text[key] = value
+        else:
+            # If the key is already in a specific config, then it's been set with a
+            # submodules specific value and we don't override
+            if key not in kwargs_text:
+                kwargs_text[key] = value
+            if key not in kwargs_speech:
+                kwargs_speech[key] = value
+    return kwargs_text, kwargs_speech
+
+
+############ SPEECH ENCODER related code ################
+
+
+# Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->SeamlessM4TConformer, feat_extract_activation->speech_encoder_hidden_act
+class SeamlessM4TConformerPositionalConvEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=config.num_conv_pos_embeddings,
+            padding=config.num_conv_pos_embeddings // 2,
+            groups=config.num_conv_pos_embedding_groups,
+        )
+
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
+            if hasattr(self.conv, "parametrizations"):
+                weight_g = self.conv.parametrizations.weight.original0
+                weight_v = self.conv.parametrizations.weight.original1
+            else:
+                weight_g = self.conv.weight_g
+                weight_v = self.conv.weight_v
+            deepspeed.zero.register_external_parameter(self, weight_v)
+            deepspeed.zero.register_external_parameter(self, weight_g)
+        else:
+            self.conv = weight_norm(self.conv, name="weight", dim=2)
+
+        self.padding = SeamlessM4TConformerSamePadLayer(config.num_conv_pos_embeddings)
+        self.activation = ACT2FN[config.speech_encoder_hidden_act]
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose(1, 2)
+
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.padding(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRotaryPositionalEmbedding with Wav2Vec2->SeamlessM4T, num_attention_heads->speech_encoder_attention_heads
+class SeamlessM4TConformerRotaryPositionalEmbedding(nn.Module):
+    """Rotary positional embedding
+    Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        dim = config.hidden_size // config.speech_encoder_attention_heads
+        base = config.rotary_embedding_base
+
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self.cached_sequence_length = None
+        self.cached_rotary_positional_embedding = None
+
+    def forward(self, hidden_states):
+        sequence_length = hidden_states.shape[1]
+
+        if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
+            return self.cached_rotary_positional_embedding
+
+        self.cached_sequence_length = sequence_length
+        # Embeddings are computed in the dtype of the inv_freq constant
+        time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
+        freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
+        embeddings = torch.cat((freqs, freqs), dim=-1)
+
+        cos_embeddings = embeddings.cos()[:, None, None, :]
+        sin_embeddings = embeddings.sin()[:, None, None, :]
+        # Computed embeddings are cast to the dtype of the hidden state inputs
+        self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states)
+        return self.cached_rotary_positional_embedding
+
+
+# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerRelPositionalEmbedding with Wav2Vec2->SeamlessM4T
+class SeamlessM4TConformerRelPositionalEmbedding(nn.Module):
+    """Relative positional encoding module."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.max_len = config.max_source_positions
+        self.d_model = config.hidden_size
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
+
+    def extend_pe(self, x):
+        # Reset the positional encodings
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` is the position of query vector and `j` is the
+        # position of key vector. We use positive relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x.size(1), self.d_model)
+        pe_negative = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
+        )
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+        # Reverse the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://huggingface.co/papers/1901.02860
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, hidden_states: torch.Tensor):
+        self.extend_pe(hidden_states)
+        start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
+        end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
+        relative_position_embeddings = self.pe[:, start_idx:end_idx]
+
+        return relative_position_embeddings
+
+
+# Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSamePadLayer with Wav2Vec2->SeamlessM4T
+class SeamlessM4TConformerSamePadLayer(nn.Module):
+    def __init__(self, num_conv_pos_embeddings):
+        super().__init__()
+        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
+
+    def forward(self, hidden_states):
+        if self.num_pad_remove > 0:
+            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
+        return hidden_states
+
+
+class SeamlessM4TConformerFeatureProjection(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps)
+        self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+    def forward(self, hidden_states):
+        # non-projected hidden states are needed for quantization
+        norm_hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.projection(norm_hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+class SeamlessM4TConformerFeedForward(nn.Module):
+    def __init__(self, config, act_fn=None, dropout=None):
+        super().__init__()
+        dropout = dropout if dropout is not None else config.speech_encoder_dropout
+        act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act
+
+        self.intermediate_dropout = nn.Dropout(dropout)
+        self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size)
+        self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn
+
+        self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size)
+        self.output_dropout = nn.Dropout(dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states)
+
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states)
+        return hidden_states
+
+
+class SeamlessM4TConformerConvolutionModule(nn.Module):
+    """Convolution block used in the conformer block"""
+
+    def __init__(self, config):
+        super().__init__()
+        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+        self.pointwise_conv1 = nn.Conv1d(
+            config.hidden_size,
+            2 * config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.glu = nn.GLU(dim=1)
+        self.depthwise_conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding="same",
+            groups=config.hidden_size,
+            bias=False,
+        )
+        self.batch_norm = nn.BatchNorm1d(config.hidden_size)
+        self.activation = ACT2FN[config.speech_encoder_hidden_act]
+        self.pointwise_conv2 = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+    def forward(self, hidden_states, attention_mask=None):
+        hidden_states = self.layer_norm(hidden_states)
+
+        # Ensure that we do not leak padded positions in depthwise convolution.
+        # Put 0 where necessary
+        if attention_mask is not None:
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism
+        # => (batch, 2*channel, dim)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # => (batch, channel, dim)
+        hidden_states = self.glu(hidden_states)
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.batch_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class SeamlessM4TConformerSelfAttention(nn.Module):
+    """Construct a SeamlessM4TConformerSelfAttention object.
+    Can be enhanced with rotary or relative position embeddings.
+    """
+
+    def __init__(self, config, use_position_embeddings=True):
+        super().__init__()
+
+        self.head_size = config.hidden_size // config.speech_encoder_attention_heads
+        self.num_heads = config.speech_encoder_attention_heads
+        self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None
+
+        self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.dropout = nn.Dropout(p=config.speech_encoder_dropout)
+
+        if self.position_embeddings_type == "relative":
+            # linear transformation for positional encoding
+            self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+            # these two learnable bias are used in matrix c and matrix d
+            # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+
+    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention.forward
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # self-attention mechanism
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        if self.position_embeddings_type == "rotary":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
+                )
+            query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        if self.position_embeddings_type == "relative":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type =="
+                    " 'relative'"
+                )
+            # apply relative_position_embeddings to qk scores
+            # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860
+            scores = self._apply_relative_embeddings(
+                query=query, key=key, relative_position_embeddings=relative_position_embeddings
+            )
+        else:
+            scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        # apply attention_mask if necessary
+        if attention_mask is not None:
+            scores = scores + attention_mask
+
+        # => (batch, head, time1, time2)
+        probs = torch.softmax(scores, dim=-1)
+        probs = self.dropout(probs)
+
+        # => (batch, head, time1, d_k)
+        hidden_states = torch.matmul(probs, value)
+
+        # => (batch, time1, hidden_size)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        hidden_states = self.linear_out(hidden_states)
+
+        return hidden_states, probs
+
+    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention._apply_rotary_embedding
+    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
+
+        cos = relative_position_embeddings[0, :sequence_length, ...]
+        sin = relative_position_embeddings[1, :sequence_length, ...]
+
+        # rotate hidden_states with rotary embeddings
+        hidden_states = hidden_states.transpose(0, 1)
+        rotated_states_begin = hidden_states[..., : self.head_size // 2]
+        rotated_states_end = hidden_states[..., self.head_size // 2 :]
+        rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
+        hidden_states = (hidden_states * cos) + (rotated_states * sin)
+        hidden_states = hidden_states.transpose(0, 1)
+
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
+
+        return hidden_states
+
+    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerSelfAttention._apply_relative_embeddings
+    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
+        # 1. project positional embeddings
+        # => (batch, head, 2*time1-1, d_k)
+        proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.view(
+            relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
+        )
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
+
+        # 2. Add bias to query
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
+        q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
+
+        # 3. attention score: first compute matrix a and matrix c
+        # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+        # => (batch, head, time1, time2)
+        scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
+
+        # 4. then compute matrix b and matrix d
+        # => (batch, head, time1, 2*time1-1)
+        scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
+
+        # 5. shift matrix b and matrix d
+        zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
+        scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
+        scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
+        scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
+        scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
+        scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
+
+        # 6. sum matrices
+        # => (batch, head, time1, time2)
+        scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
+
+        return scores
+
+
+class SeamlessM4TConformerEncoderLayer(GradientCheckpointingLayer):
+    """Conformer block based on https://huggingface.co/papers/2005.08100."""
+
+    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerEncoderLayer.__init__ with Wav2Vec2->SeamlessM4T, attention_dropout->speech_encoder_dropout, torch.nn->nn
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.speech_encoder_dropout
+
+        # Feed-forward 1
+        self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn1 = SeamlessM4TConformerFeedForward(config)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_dropout = nn.Dropout(dropout)
+        self.self_attn = SeamlessM4TConformerSelfAttention(config)
+
+        # Conformer Convolution
+        self.conv_module = SeamlessM4TConformerConvolutionModule(config)
+
+        # Feed-forward 2
+        self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn2 = SeamlessM4TConformerFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        conv_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # 1. Feed-Forward 1 layer
+        residual = hidden_states
+        hidden_states = self.ffn1_layer_norm(hidden_states)
+        hidden_states = self.ffn1(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        residual = hidden_states
+
+        # 2. Self-Attention layer
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weigts = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            relative_position_embeddings=relative_position_embeddings,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # 3. Convolutional Layer
+        residual = hidden_states
+        hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask)
+        hidden_states = residual + hidden_states
+
+        # 4. Feed-Forward 2 Layer
+        residual = hidden_states
+        hidden_states = self.ffn2_layer_norm(hidden_states)
+        hidden_states = self.ffn2(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return hidden_states, attn_weigts
+
+
+class SeamlessM4TConformerEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        if config.position_embeddings_type == "relative":
+            self.embed_positions = SeamlessM4TConformerRelPositionalEmbedding(config)
+        elif config.position_embeddings_type == "rotary":
+            self.embed_positions = SeamlessM4TConformerRotaryPositionalEmbedding(config)
+        else:
+            self.embed_positions = None
+
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+        self.layers = nn.ModuleList(
+            [SeamlessM4TConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
+        )
+
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        conv_attention_mask = attention_mask
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+            # extend attention_mask
+            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+            attention_mask = attention_mask.expand(
+                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+            )
+
+        hidden_states = self.dropout(hidden_states)
+
+        if self.embed_positions is not None:
+            relative_position_embeddings = self.embed_positions(hidden_states)
+        else:
+            relative_position_embeddings = None
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.config.speech_encoder_layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    relative_position_embeddings=relative_position_embeddings,
+                    output_attentions=output_attentions,
+                    conv_attention_mask=conv_attention_mask,
+                )
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class SeamlessM4TConformerAdapterLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.adaptor_dropout
+
+        self.kernel_size = config.adaptor_kernel_size
+        self.stride = config.adaptor_stride
+
+        # 1. residual convolution
+        self.residual_layer_norm = nn.LayerNorm(embed_dim)
+        self.residual_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.activation = nn.GLU(dim=1)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.self_attn = SeamlessM4TConformerSelfAttention(config, use_position_embeddings=False)
+        self.self_attn_dropout = nn.Dropout(dropout)
+
+        # Feed-forward
+        self.ffn_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=dropout)
+
+    def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+        pad = self.kernel_size // 2
+        seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+        seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
+
+        return seq_lens.floor()
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        residual = self.residual_layer_norm(hidden_states)
+
+        # Apply pooling to the residual to match the sequence length of the
+        # multi-head attention output.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        residual = residual.transpose(1, 2)
+        residual = self.residual_conv(residual)
+        residual = self.activation(residual)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        residual = residual.transpose(1, 2)
+
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # Apply pooling before feeding to the multihead-attention layer.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.self_attn_conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                hidden_states.device
+            )
+            attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
+            attention_mask = _prepare_4d_attention_mask(
+                attention_mask,
+                hidden_states.dtype,
+            )
+
+        # The rest of the computation is identical to a vanilla Transformer
+        # encoder layer.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+        hidden_states = self.ffn(hidden_states) + residual
+
+        return hidden_states
+
+
+class SeamlessM4TConformerAdapter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.layers = nn.ModuleList(SeamlessM4TConformerAdapterLayer(config) for _ in range(config.num_adapter_layers))
+
+    def forward(self, hidden_states, attention_mask):
+        # down project hidden_states if necessary
+
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, attention_mask)
+
+        return hidden_states
+
+
+############ TEXT / UNITS related code ################
+
+
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ScaledWordEmbedding with M2M100->SeamlessM4T
+class SeamlessM4TScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.embed_scale = embed_scale
+
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale
+
+
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding
+class SeamlessM4TSinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # in forward put the weights on the correct dtype and device of the param
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.register_buffer("weights", emb_weights, persistent=False)
+
+    @staticmethod
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build sinusoidal embeddings.
+
+        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
+        "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+
+        return emb.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values_length: int = 0,
+    ):
+        if input_ids is not None:
+            bsz, seq_len = input_ids.size()
+            # Create the position ids from the input token ids. Any padded tokens remain padded.
+            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
+                input_ids.device
+            )
+        else:
+            bsz, seq_len = inputs_embeds.size()[:-1]
+            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+
+        # expand embeddings if needed
+        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+        if max_pos > self.weights.size(0):
+            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
+
+
+class SeamlessM4TAttention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4T
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[SeamlessM4TConfig] = None,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+        self.layer_idx = layer_idx
+        if layer_idx is None and self.is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        # if encoder_hidden_states are provided this layer is used as a cross-attention layer
+        # for the decoder
+        is_cross_attention = encoder_hidden_states is not None
+
+        bsz, tgt_len, _ = hidden_states.size()
+
+        # get query proj
+        query_states = self.q_proj(hidden_states) * self.scaling
+
+        is_updated = False
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(bsz, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        proj_shape = (bsz * self.num_heads, -1, self.head_dim)
+        query_states = query_states.view(bsz, tgt_len, self.num_heads, self.head_dim).transpose(1, 2)
+        query_states = query_states.reshape(*proj_shape)
+        key_states = key_states.reshape(*proj_shape)
+        value_states = value_states.reshape(*proj_shape)
+
+        src_len = key_states.size(1)
+        attn_weights = torch.bmm(query_states, key_states.transpose(1, 2))
+
+        if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
+            raise ValueError(
+                f"Attention weights should be of size {(bsz * self.num_heads, tgt_len, src_len)}, but is"
+                f" {attn_weights.size()}"
+            )
+
+        if attention_mask is not None:
+            if attention_mask.size() != (bsz, 1, tgt_len, src_len):
+                raise ValueError(
+                    f"Attention mask should be of size {(bsz, 1, tgt_len, src_len)}, but is {attention_mask.size()}"
+                )
+            attn_weights = attn_weights.view(bsz, self.num_heads, tgt_len, src_len) + attention_mask
+            attn_weights = attn_weights.view(bsz * self.num_heads, tgt_len, src_len)
+
+        attn_weights = nn.functional.softmax(attn_weights, dim=-1)
+
+        if output_attentions:
+            # this operation is a bit awkward, but it's required to
+            # make sure that attn_weights keeps its gradient.
+            # In order to do so, attn_weights have to be reshaped
+            # twice and have to be reused in the following
+            attn_weights_reshaped = attn_weights.view(bsz, self.num_heads, tgt_len, src_len)
+            attn_weights = attn_weights_reshaped.view(bsz * self.num_heads, tgt_len, src_len)
+        else:
+            attn_weights_reshaped = None
+
+        attn_probs = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        attn_output = torch.bmm(attn_probs, value_states)
+
+        if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
+            raise ValueError(
+                f"`attn_output` should be of size {(bsz * self.num_heads, tgt_len, self.head_dim)}, but is"
+                f" {attn_output.size()}"
+            )
+
+        attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim)
+        attn_output = attn_output.transpose(1, 2)
+
+        # Use the `embed_dim` from the config (stored in the class) rather than `hidden_state` because `attn_output` can be
+        # partitioned across GPUs when using tensor-parallelism.
+        attn_output = attn_output.reshape(bsz, tgt_len, self.embed_dim)
+
+        attn_output = self.out_proj(attn_output)
+
+        return attn_output, attn_weights_reshaped
+
+
+# Copied from transformers.models.nllb_moe.modeling_nllb_moe.NllbMoeDenseActDense with NllbMoe->SeamlessM4T,DenseActDense->FeedForwardNetwork, d_model->hidden_size
+class SeamlessM4TFeedForwardNetwork(nn.Module):
+    def __init__(self, config: SeamlessM4TConfig, ffn_dim: int):
+        super().__init__()
+        self.fc1 = nn.Linear(config.hidden_size, ffn_dim)
+        self.fc2 = nn.Linear(ffn_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.activation_dropout)
+        self.act = ACT2FN[config.activation_function]
+
+    def forward(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        if (
+            isinstance(self.fc2.weight, torch.Tensor)
+            and hidden_states.dtype != self.fc2.weight.dtype
+            and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
+        ):
+            hidden_states = hidden_states.to(self.fc2.weight.dtype)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+class SeamlessM4TEncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: SeamlessM4TConfig, encoder_ffn_dim=None, encoder_attention_heads=None):
+        super().__init__()
+        encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim
+        encoder_attention_heads = (
+            config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads
+        )
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = SeamlessM4TAttention(
+            embed_dim=self.embed_dim,
+            num_heads=encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=encoder_ffn_dim)
+
+        self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.attn_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = self.ffn_dropout(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+class SeamlessM4TDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: SeamlessM4TConfig, decoder_ffn_dim=None, decoder_attention_heads=None, layer_idx=None):
+        super().__init__()
+        decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
+        decoder_attention_heads = (
+            config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
+        )
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = SeamlessM4TAttention(
+            embed_dim=self.embed_dim,
+            num_heads=decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            layer_idx=layer_idx,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.attn_dropout = nn.Dropout(config.dropout)
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.cross_attention = SeamlessM4TAttention(
+            self.embed_dim,
+            decoder_attention_heads,
+            config.attention_dropout,
+            is_decoder=True,
+            layer_idx=layer_idx,
+        )
+        self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.ffn = SeamlessM4TFeedForwardNetwork(config, ffn_dim=decoder_ffn_dim)
+
+        self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`):
+                encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by
+                very large negative values.
+            past_key_values (`Cache`):
+                cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = self.attn_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states = self.cross_attention_layer_norm(hidden_states)
+
+            hidden_states, cross_attn_weights = self.cross_attention(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            hidden_states = self.attn_dropout(hidden_states)
+            hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = self.ffn_dropout(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        return hidden_states, self_attn_weights, cross_attn_weights
+
+
+############ SUB-MODELS related code ################
+
+
+@auto_docstring
+class SeamlessM4TPreTrainedModel(PreTrainedModel):
+    config: SeamlessM4TConfig
+    base_model_prefix = "seamless_m4t"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["SeamlessM4TEncoderLayer", "SeamlessM4TDecoderLayer", "SeamlessM4TConformerEncoderLayer"]
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights"""
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, SeamlessM4TConformerSelfAttention):
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, SeamlessM4TConformerPositionalConvEmbedding):
+            nn.init.normal_(
+                module.conv.weight,
+                mean=0,
+                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+            )
+            nn.init.constant_(module.conv.bias, 0)
+        elif isinstance(module, SeamlessM4TConformerFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, (nn.LayerNorm, nn.BatchNorm1d)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+        kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride
+        pad = kernel_size // 2
+        seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+        seq_lens = ((seq_lens + 2 * pad - kernel_size) / stride) + 1
+
+        return seq_lens.floor()
+
+    def compute_last_hidden_states_per_sample(
+        self,
+        hidden_states: tuple[tuple[torch.Tensor]],
+        beam_indices: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Computes the last hidden states.
+
+        Parameters:
+            hidden_states (`tuple[tuple[torch.Tensor]]`):
+                The generated hidden states. Tuple (one element for each generated token) of tuples (one element for
+                each layer of the decoder) of torch.FloatTensor of shape (batch_size*num_beams*num_return_sequences,
+                generated_length, hidden_size).
+            beam_indices (`torch.LongTensor`, *optional*):
+                Beam indices of generated token id at each generation step. `torch.LongTensor` of shape
+                `(batch_size*num_return_sequences, sequence_length)`. Only required if a `num_beams>1` at
+                generate-time.
+
+        Return:
+            `torch.Tensor`: A `torch.Tensor` of shape `(batch_size*num_return_sequences, sequence_length, hidden_size)`
+            containing
+                the last hidden states.
+        ```"""
+        # 1. First, let's compute last_hidden_states from hidden_states.
+        # For each generation step, takes the hidden state from the last layer.
+        # shape: (batch_size*vocab_size*num_return_sequences, # generation_steps, hidden_dim)
+        last_hidden_states = torch.concat([hidden_states[-1] for hidden_states in hidden_states], dim=1)
+
+        # 2. In absence of `beam_indices`, we can assume that we come from e.g. greedy search, which is equivalent
+        # to a beam search approach were the first (and only) beam is always selected
+        # in that case, return directly last_hidden_states
+        if beam_indices is None:
+            return last_hidden_states
+
+        # 3. cut beam_indices to longest beam length
+        beam_indices_mask = beam_indices < 0
+        max_beam_length = (1 - beam_indices_mask.long()).sum(-1).max()
+        beam_indices = beam_indices.clone()[:, :max_beam_length]
+        beam_indices_mask = beam_indices_mask[:, :max_beam_length]
+
+        # 4. Set indices of beams that finished early to 0; such indices will be masked correctly afterwards anyways
+        beam_indices[beam_indices_mask] = 0
+
+        # 5. expand beam_indices to last_hidden_states dim
+        beam_indices = beam_indices.unsqueeze(-1)
+        beam_indices = beam_indices.expand(-1, -1, last_hidden_states.shape[-1])
+
+        # 6. select the right candidate for each beam
+        # in other words, new_last_hidden_states[i,j,k] = last_hidden_states[beam_indices[i,j,k], j, k] for all i, j, k
+        last_hidden_states = torch.gather(last_hidden_states, 0, beam_indices)
+
+        return last_hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer speech encoder consisting of *config.speech_encoder_layers* conformer self attention layers.
+    Each layer is a [`SeamlessM4TConformerEncoderLayer`].
+    """
+)
+class SeamlessM4TSpeechEncoder(SeamlessM4TPreTrainedModel):
+    main_input_name = "input_features"
+
+    def __init__(self, config: SeamlessM4TConfig):
+        super().__init__(config)
+
+        self.feature_projection = SeamlessM4TConformerFeatureProjection(config)
+        self.encoder = SeamlessM4TConformerEncoder(config)
+        self.intermediate_ffn = SeamlessM4TConformerFeedForward(config, act_fn="relu", dropout=0.0)
+        self.adapter = SeamlessM4TConformerAdapter(config) if config.add_adapter else None
+        self.inner_layer_norm = nn.LayerNorm(config.hidden_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, Wav2Vec2BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_features is None:
+            raise ValueError(
+                """Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4TSpeechEncoder.forward`.
+                Make sure one of them is not `None`."""
+            )
+
+        hidden_states = self.feature_projection(input_features)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        expanded_hidden_states = self.intermediate_ffn(hidden_states)
+        hidden_states = hidden_states + 0.5 * expanded_hidden_states
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
+
+        hidden_states = self.inner_layer_norm(hidden_states)
+
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return Wav2Vec2BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# inspired from MBart and NllbMoe
+@auto_docstring(
+    custom_intro="""
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4TEncoderLayer`].
+    """
+)
+class SeamlessM4TEncoder(SeamlessM4TPreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4TConfig,
+        embed_tokens: Optional[nn.Embedding] = None,
+        is_t2u_encoder: bool = False,
+    ):
+        r"""
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+        is_t2u_encoder (`bool`, *optional*, defaults to `False`):
+            indicates if it belongs to the text-to-units model, in which case it won't have input embeddings
+        """
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        embed_dim = config.hidden_size
+
+        self.is_t2u_encoder = is_t2u_encoder
+        self.max_source_positions = config.max_position_embeddings
+
+        if not self.is_t2u_encoder:
+            embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+            self.embed_tokens = SeamlessM4TScaledWordEmbedding(
+                config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
+            )
+
+            if embed_tokens is not None:
+                self.embed_tokens.weight = embed_tokens.weight
+
+            self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding(
+                self.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+            )
+
+        layers = []
+        for _ in range(config.encoder_layers):
+            layers.append(
+                SeamlessM4TEncoderLayer(
+                    config,
+                    encoder_attention_heads=config.encoder_attention_heads,
+                    encoder_ffn_dim=config.encoder_ffn_dim,
+                )
+            )
+
+        self.layers = nn.ModuleList(layers)
+
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and self.is_t2u_encoder:
+            raise ValueError(
+                "You cannot pass input_ids to the encoder of the text_to_units model. Pass inputs_embeds instead."
+            )
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input = input_ids
+            input_shape = input.shape
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if not self.is_t2u_encoder:
+            embed_pos = self.embed_positions(input)
+
+            hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
+        else:
+            hidden_states = inputs_embeds
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4TDecoderLayer`].
+    """
+)
+class SeamlessM4TDecoder(SeamlessM4TPreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4TConfig,
+        embed_tokens: Optional[nn.Embedding] = None,
+    ):
+        r"""
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+        """
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.max_target_positions = config.max_position_embeddings
+        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            # if embed_tokens defined, use its shape instead
+            self.embed_tokens = SeamlessM4TScaledWordEmbedding(
+                embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx, embed_scale=embed_scale
+            )
+            self.embed_tokens.weight = embed_tokens.weight
+        else:
+            self.embed_tokens = SeamlessM4TScaledWordEmbedding(
+                self.vocab_size, config.hidden_size, self.padding_idx, embed_scale=embed_scale
+            )
+
+        self.embed_positions = SeamlessM4TSinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        layers = []
+        for i in range(config.decoder_layers):
+            layers.append(
+                SeamlessM4TDecoderLayer(
+                    config,
+                    decoder_attention_heads=config.decoder_attention_heads,
+                    decoder_ffn_dim=config.decoder_ffn_dim,
+                    layer_idx=i,
+                )
+            )
+        self.layers = nn.ModuleList(layers)
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input = input_ids
+            input_shape = input.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # initialize `past_key_values`
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        attention_mask = _prepare_4d_causal_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _prepare_4d_attention_mask(
+                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # embed positions
+        positions = self.embed_positions(input, past_key_values_length=past_key_values_length)
+
+        hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+            hidden_states = layer_outputs[0]
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4TEncoder`] without embeddings and the decoder is a [`SeamlessM4TDecoder`].
+    """
+)
+class SeamlessM4TTextToUnitModel(SeamlessM4TPreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4TConfig,
+        embed_tokens_decoder: Optional[nn.Embedding] = None,
+    ):
+        r"""
+        embed_tokens_decoder (`nn.Embedding`, *optional*):
+            input embedding of the decoder.
+        """
+        super().__init__(config)
+
+        self.encoder = SeamlessM4TEncoder(config, is_t2u_encoder=True)
+        self.decoder = SeamlessM4TDecoder(config, embed_tokens_decoder)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return Seq2SeqModelOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4TTextToUnit`].
+    """
+)
+class SeamlessM4TTextToUnitForConditionalGeneration(SeamlessM4TPreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = [
+        "vocoder",
+        "speech_encoder",
+        "text_encoder",
+        "text_decoder",
+    ]
+    _tied_weights_keys = ["lm_head.weight"]
+
+    def __init__(
+        self,
+        config: SeamlessM4TConfig,
+        embed_tokens_decoder: Optional[nn.Embedding] = None,
+    ):
+        r"""
+        embed_tokens_decoder (`nn.Embedding`, *optional*):
+            input embedding of the decoder.
+        """
+        # update config - used principality for bos_token_id etc.
+        config = copy.deepcopy(config)
+        for param, val in config.to_dict().items():
+            if param.startswith("t2u_"):
+                config.__setattr__(param[4:], val)
+        super().__init__(config)
+
+        self.model = SeamlessM4TTextToUnitModel(config, embed_tokens_decoder)
+
+        self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.model.encoder
+
+    def get_decoder(self):
+        return self.model.decoder
+
+    def get_input_embeddings(self):
+        return self.model.decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id
+                )
+
+        outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            decoder_input_ids=decoder_input_ids,
+            encoder_outputs=encoder_outputs,
+            decoder_attention_mask=decoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            decoder_inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+        lm_logits = self.lm_head(outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=outputs.past_key_values,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            cross_attentions=outputs.cross_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+        )
+
+    def prepare_decoder_input_ids_from_labels(self, labels: torch.Tensor):
+        return shift_tokens_right(labels, self.config.t2u_pad_token_id, self.config.t2u_decoder_start_token_id)
+
+    def _tie_weights(self) -> None:
+        if getattr(self.config, "tie_word_embeddings", True):
+            output_embeddings = self.get_output_embeddings()
+            if output_embeddings is not None:
+                self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
+
+
+############ VOCODER related code ################
+
+
+# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+class HifiGanResidualBlock(nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+        super().__init__()
+        self.leaky_relu_slope = leaky_relu_slope
+
+        self.convs1 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation[i],
+                    padding=self.get_padding(kernel_size, dilation[i]),
+                )
+                for i in range(len(dilation))
+            ]
+        )
+        self.convs2 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=1,
+                    padding=self.get_padding(kernel_size, 1),
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+
+    def get_padding(self, kernel_size, dilation=1):
+        return (kernel_size * dilation - dilation) // 2
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        for layer in self.convs1:
+            weight_norm(layer)
+        for layer in self.convs2:
+            weight_norm(layer)
+
+    def remove_weight_norm(self):
+        for layer in self.convs1:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.convs2:
+            nn.utils.remove_weight_norm(layer)
+
+    def forward(self, hidden_states):
+        for conv1, conv2 in zip(self.convs1, self.convs2):
+            residual = hidden_states
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv1(hidden_states)
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv2(hidden_states)
+            hidden_states = hidden_states + residual
+        return hidden_states
+
+
+class SeamlessM4TVariancePredictor(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        embed_dim = config.unit_embed_dim
+        kernel_size = config.variance_predictor_kernel_size
+        var_pred_dropout = config.var_pred_dropout
+
+        self.conv1 = nn.Conv1d(
+            embed_dim,
+            embed_dim,
+            kernel_size=kernel_size,
+            padding=(kernel_size - 1) // 2,
+        )
+        self.activation_function = nn.ReLU()
+        self.ln1 = nn.LayerNorm(embed_dim)
+        self.dropout_module = nn.Dropout(p=var_pred_dropout)
+        self.conv2 = nn.Conv1d(
+            embed_dim,
+            embed_dim,
+            kernel_size=kernel_size,
+            padding=1,
+        )
+        self.ln2 = nn.LayerNorm(embed_dim)
+        self.proj = nn.Linear(embed_dim, 1)
+
+    def forward(self, hidden_states: Tensor) -> Tensor:
+        # Input: B x T x C; Output: B x T
+        hidden_states = self.conv1(hidden_states.transpose(1, 2))
+        hidden_states = self.activation_function(hidden_states).transpose(1, 2)
+        hidden_states = self.dropout_module(self.ln1(hidden_states))
+        hidden_states = self.conv2(hidden_states.transpose(1, 2))
+        hidden_states = self.activation_function(hidden_states).transpose(1, 2)
+        hidden_states = self.dropout_module(self.ln2(hidden_states))
+        return self.proj(hidden_states).squeeze(dim=2)
+
+
+class SeamlessM4THifiGan(nn.Module):
+    def __init__(self, config: SeamlessM4TConfig):
+        super().__init__()
+        model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim
+        self.leaky_relu_slope = config.leaky_relu_slope
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.num_upsamples = len(config.upsample_rates)
+        self.conv_pre = nn.Conv1d(
+            model_in_dim,
+            config.upsample_initial_channel,
+            kernel_size=7,
+            stride=1,
+            padding=3,
+        )
+
+        self.upsampler = nn.ModuleList()
+        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+            self.upsampler.append(
+                nn.ConvTranspose1d(
+                    config.upsample_initial_channel // (2**i),
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    kernel_size=kernel_size,
+                    stride=upsample_rate,
+                    padding=(kernel_size - upsample_rate) // 2,
+                )
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.upsampler)):
+            channels = config.upsample_initial_channel // (2 ** (i + 1))
+            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
+
+        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
+
+    def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor:
+        r"""
+        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
+        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
+        waveform.
+
+        Args:
+            spectrogram (`torch.FloatTensor`):
+                Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
+                model_in_dim)`, or un-batched and of shape `(sequence_length, model_in_dim)`. Note that `model_in_dim`
+                is the sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`.
+
+        Returns:
+            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
+            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
+        """
+
+        hidden_states = self.conv_pre(input_embeds)
+        for i in range(self.num_upsamples):
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = self.upsampler[i](hidden_states)
+
+            res_state = self.resblocks[i * self.num_kernels](hidden_states)
+            for j in range(1, self.num_kernels):
+                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+            hidden_states = res_state / self.num_kernels
+
+        hidden_states = nn.functional.leaky_relu(hidden_states)
+        hidden_states = self.conv_post(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+
+        # remove seq-len dim since this collapses to 1
+        waveform = hidden_states.squeeze(1)
+
+        return waveform
+
+
+@auto_docstring(
+    custom_intro="""
+    Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis).
+    """
+)
+class SeamlessM4TCodeHifiGan(PreTrainedModel):
+    config: SeamlessM4TConfig
+    main_input_name = "input_embeds"
+    _no_split_modules = []
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.pad_token_id = config.t2u_pad_token_id
+        self.dur_predictor = SeamlessM4TVariancePredictor(config)
+
+        self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim)
+        self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim)
+        self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim)
+
+        self.hifi_gan = SeamlessM4THifiGan(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def _get_dur_output_lengths(self, input_ids, dur_out):
+        """
+        Computes the output length after the duration layer.
+        """
+        unit_lengths = (input_ids != self.pad_token_id).sum(1)
+
+        # take care of edge cases where no padding or too many padding
+        unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1)
+
+        cumulative_dur_out = torch.cumsum(dur_out, dim=1)
+        unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze()
+
+        return unit_lengths
+
+    def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the hifigan convolutional layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (
+                torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1
+            )
+
+        def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+            return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1
+
+        # conv_pre
+        input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+        # upsampler
+        for i, (upsample_rate, kernel_size) in enumerate(
+            zip(self.config.upsample_rates, self.config.upsample_kernel_sizes)
+        ):
+            input_lengths = _transpose_conv_out_length(
+                input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2
+            )
+
+        # resblock
+        for i in range(len(self.config.upsample_rates)):
+            for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes):
+                for dil in dilation:
+                    input_lengths = _conv_out_length(
+                        input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil
+                    )
+
+                for dil in dilation:
+                    input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1)
+
+        # conv_post
+        input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+        return input_lengths
+
+    def forward(
+        self, input_ids: torch.LongTensor, spkr_id: torch.Tensor, lang_id: torch.Tensor
+    ) -> tuple[torch.Tensor]:
+        """
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTextToUnitForConditionalGeneration`]. [What are input
+                IDs?](../glossary#input-ids)
+            spkr_id (`int`, *optional*):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            tgt_lang (`str`, *optional*):
+                The language id to use as target language for translation.
+        """
+        hidden_states = self.unit_embedding(input_ids).transpose(1, 2)
+        spkr = self.speaker_embedding(spkr_id).transpose(1, 2)
+        lang = self.language_embedding(lang_id).transpose(1, 2)
+
+        log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2))
+        dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1)
+        # B x C x T
+        if hidden_states.size(0) == 1:
+            hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2)
+        else:
+            # if batched sample, need to interleave per sample, and pad -> loss of parallelism
+            if hidden_states.shape[0] > 1 and self.training:
+                logger.warning(
+                    """`self.training=True` and you use batching. You lose parallelism during the hifigan
+                               forward pass because the samples are interleaved."""
+                )
+            hidden_states = [
+                torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1)
+                for (hidden_state, duration) in zip(hidden_states, dur_out)
+            ]
+
+            hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2)
+
+        spkr = spkr.repeat(1, 1, hidden_states.shape[-1])
+        lang = lang.repeat(1, 1, hidden_states.shape[-1])
+        hidden_states = torch.cat([lang, hidden_states, spkr], dim=1)
+
+        hidden_states = self.hifi_gan(hidden_states)
+
+        unit_lengths = self._get_dur_output_lengths(input_ids, dur_out)
+        lengths = self._get_output_hifigan_lengths(unit_lengths)
+
+        return hidden_states, lengths
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights."""
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.hifi_gan.conv_pre)
+        for layer in self.hifi_gan.upsampler:
+            weight_norm(layer)
+        for layer in self.hifi_gan.resblocks:
+            layer.apply_weight_norm()
+        weight_norm(self.hifi_gan.conv_post)
+
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.hifi_gan.conv_pre)
+        for layer in self.hifi_gan.upsampler:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.hifi_gan.resblocks:
+            layer.remove_weight_norm()
+        nn.utils.remove_weight_norm(self.hifi_gan.conv_post)
+
+
+############ WHOLE MODEL related code ################
+
+
+@auto_docstring(
+    custom_intro="""
+    The text-to-text SeamlessM4T Model transformer which can be used for T2TT.
+    """
+)
+class SeamlessM4TForTextToText(SeamlessM4TPreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"]
+    main_input_name = "input_ids"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config: SeamlessM4TConfig):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4TEncoder(config, self.shared)
+        self.text_decoder = SeamlessM4TDecoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.text_encoder
+
+    def get_decoder(self):
+        return self.text_decoder
+
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def generate(
+        self,
+        input_ids=None,
+        tgt_lang=None,
+        generation_config=None,
+        logits_processor=None,
+        stopping_criteria=None,
+        prefix_allowed_tokens_fn=None,
+        synced_gpus=False,
+        **kwargs,
+    ):
+        """
+        Generates sequences of token ids.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            input_ids (`torch.Tensor` of varying shape depending on the modality, *optional*):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
+                If provided, this function constraints the beam search to allowed tokens only at each step. If not
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+                Retrieval](https://huggingface.co/papers/2010.00904).
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            kwargs (`dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
+            [`~utils.ModelOutput`] types are:
+                - [`~generation.GenerateEncoderDecoderOutput`],
+                - [`~generation.GenerateBeamEncoderDecoderOutput`]
+        """
+        # prepare text_decoder_input_ids
+        text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        if tgt_lang is not None:
+            batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+            if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+                # also accept __xxx__
+                tgt_lang = tgt_lang.replace("__", "")
+                if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+                        {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+                    )
+                # tgt_lang gets priority over decoder input ids
+                text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+                text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+            else:
+                raise ValueError(
+                    """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+                    the target language to the right token id. Make sure to load the right generation config."""
+                )
+        else:
+            # only a warning, otherwise errors appear in the tests
+            logger.warning(
+                """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+                a correct generation, otherwise the generation will probably make no sense."""
+            )
+
+        return super().generate(
+            input_ids,
+            generation_config,
+            logits_processor,
+            stopping_criteria,
+            prefix_allowed_tokens_fn,
+            synced_gpus,
+            decoder_input_ids=text_decoder_input_ids,
+            **kwargs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The speech-to-text SeamlessM4T Model transformer which can be used for S2TT.
+    """
+)
+class SeamlessM4TForSpeechToText(SeamlessM4TPreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"]
+    main_input_name = "input_features"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config: SeamlessM4TConfig):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.speech_encoder = SeamlessM4TSpeechEncoder(config)
+        self.text_decoder = SeamlessM4TDecoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.speech_encoder
+
+    def get_decoder(self):
+        return self.text_decoder
+
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_decoder.embed_tokens = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_features: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def generate(
+        self,
+        input_features=None,
+        tgt_lang=None,
+        generation_config=None,
+        logits_processor=None,
+        stopping_criteria=None,
+        prefix_allowed_tokens_fn=None,
+        synced_gpus=False,
+        **kwargs,
+    ):
+        """
+        Generates sequences of token ids.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
+                If provided, this function constraints the beam search to allowed tokens only at each step. If not
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+                Retrieval](https://huggingface.co/papers/2010.00904).
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            kwargs (`dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
+            [`~utils.ModelOutput`] types are:
+                - [`~generation.GenerateEncoderDecoderOutput`],
+                - [`~generation.GenerateBeamEncoderDecoderOutput`]
+        """
+        text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        input_features = input_features if input_features is not None else kwargs.pop("inputs")
+        if tgt_lang is not None:
+            inputs = kwargs.get("input_embeds") if input_features is None else input_features
+            inputs = (
+                inputs
+                if inputs is not None
+                else kwargs.get("encoder_outputs", {"last_hidden_state": None})["last_hidden_state"]
+            )
+            batch_size = len(inputs)
+
+            if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+                # also accept __xxx__
+                tgt_lang = tgt_lang.replace("__", "")
+                if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+                        {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+                    )
+                # tgt_lang gets priority over decoder input ids
+                text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+                text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+            else:
+                raise ValueError(
+                    """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+                    the target language to the right token id. Make sure to load the right generation config."""
+                )
+        else:
+            # only a warning, otherwise errors appear in the tests
+            logger.warning(
+                """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+                a correct generation, otherwise the generation will probably make no sense."""
+            )
+        return super().generate(
+            input_features,
+            generation_config,
+            logits_processor,
+            stopping_criteria,
+            prefix_allowed_tokens_fn,
+            synced_gpus,
+            decoder_input_ids=text_decoder_input_ids,
+            **kwargs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The text-to-speech SeamlessM4T Model transformer which can be used for T2ST.
+    """
+)
+class SeamlessM4TForTextToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["speech_encoder"]
+    main_input_name = "input_ids"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config: SeamlessM4TConfig):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4TEncoder(config, self.shared)
+        self.text_decoder = SeamlessM4TDecoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4TCodeHifiGan(config)
+
+    def get_encoder(self):
+        return self.text_encoder
+
+    def get_decoder(self):
+        return self.text_decoder
+
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This is the same forward method as `SeamlessM4TForTextToText`."
+                "It doesn't use the text-to-unit model `SeamlessM4TTextToUnitForConditionalGeneration`."
+                "If you want to generate speech, use the `.generate` method."
+            )
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        spkr_id: Optional[int] = 0,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4TGenerationOutput]:
+        """
+        Generates translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_ids, num_beams=4, speech_do_sample=True)` will successively perform
+        beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            spkr_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+
+        Returns:
+            `Union[SeamlessM4TGenerationOutput, tuple[Tensor]]`:
+            - If `return_intermediate_token_ids`, returns [`SeamlessM4TGenerationOutput`].
+            - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+              sequence_length)` and `waveform_lengths` which gives the length of each sample.
+        """
+        batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+        if tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+        else:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+        text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        text_generation_output = super().generate(input_ids, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+        # take care of num_return_sequences
+        # take most probable hidden states per batch of return_sequences
+        # (batch_size*num_return_sequences, ...) -> (batch_size,...)
+        if num_return_sequences > 1:
+            idx_most_probable_sequences_per_batch = text_generation_output.sequences_scores.view(batch_size, -1)
+            idx_most_probable_sequences_per_batch = idx_most_probable_sequences_per_batch.argmax(-1)
+            idx_most_probable_sequences_per_batch = (
+                idx_most_probable_sequences_per_batch
+                + torch.arange(batch_size, device=self.device) * num_return_sequences
+            )
+            sequences = sequences[idx_most_probable_sequences_per_batch]
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # Compute t2u decoder_input_ids
+        t2u_decoder_input_ids = kwargs_speech.get("decoder_input_ids")
+        t2u_tgt_lang_id = self.generation_config.t2u_lang_code_to_id.get(tgt_lang)
+        t2u_decoder_input_ids = torch.tensor(
+            [[self.config.t2u_eos_token_id, t2u_tgt_lang_id]] * batch_size, device=self.device
+        )
+        kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids
+        # second generation
+        unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech)
+        output_unit_ids = unit_ids.detach().clone()
+
+        # get rid of t2u_decoder_input_ids
+        unit_ids = unit_ids[:, kwargs_speech["decoder_input_ids"].shape[1] :]
+        # replace eos per pad
+        unit_ids[unit_ids == self.config.t2u_eos_token_id] = self.config.t2u_pad_token_id
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+        spkr_id = torch.tensor([[spkr_id]] * len(unit_ids), device=self.device)
+
+        waveform, waveform_lengths = self.vocoder(input_ids=unit_ids, spkr_id=spkr_id, lang_id=vocoder_tgt_lang_id)
+
+        if return_intermediate_token_ids:
+            return SeamlessM4TGenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+
+@auto_docstring(
+    custom_intro="""
+    The speech-to-speech SeamlessM4T Model transformer which can be used for S2ST.
+    """
+)
+class SeamlessM4TForSpeechToSpeech(SeamlessM4TPreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["text_encoder"]
+    main_input_name = "input_features"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.speech_encoder = SeamlessM4TSpeechEncoder(config)
+        self.text_decoder = SeamlessM4TDecoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4TCodeHifiGan(config)
+
+    def get_encoder(self):
+        return self.speech_encoder
+
+    def get_decoder(self):
+        return self.text_decoder
+
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_decoder.embed_tokens = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_features: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This is the same forward method as `SeamlessM4TForSpeechToText`. It doesn't use `self.t2u_model`."
+                "If you want to generate speech, use the `generate` method."
+            )
+
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_features: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        spkr_id: Optional[int] = 0,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4TGenerationOutput]:
+        """
+        Generates translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_features, num_beams=4, speech_do_sample=True)` will successively perform
+        beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Args:
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            spkr_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+
+        Returns:
+            `Union[SeamlessM4TGenerationOutput, tuple[Tensor]]`:
+            - If `return_intermediate_token_ids`, returns [`SeamlessM4TGenerationOutput`].
+            - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+              sequence_length)` and `waveform_lengths` which gives the length of each sample.
+        """
+        batch_size = len(input_features) if input_features is not None else len(kwargs.get("inputs_embeds"))
+
+        if tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+        else:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+        text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        text_generation_output = super().generate(input_features, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        # get last_hidden_state from encoder
+        encoder_hidden_states = self.speech_encoder(input_features=input_features, attention_mask=attention_mask)[0]
+
+        # input modality = speech so new attention mask for the decoder
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_hidden_states.device
+            )
+            attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+            )
+
+        # take care of num_return_sequences
+        # take most probable hidden states per batch of return_sequences
+        # (batch_size*num_return_sequences, ...) -> (batch_size,...)
+        if num_return_sequences > 1:
+            idx_most_probable_sequences_per_batch = text_generation_output.sequences_scores.view(batch_size, -1)
+            idx_most_probable_sequences_per_batch = idx_most_probable_sequences_per_batch.argmax(-1)
+            idx_most_probable_sequences_per_batch = (
+                idx_most_probable_sequences_per_batch
+                + torch.arange(batch_size, device=self.device) * num_return_sequences
+            )
+            sequences = sequences[idx_most_probable_sequences_per_batch]
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # Compute t2u decoder_input_ids
+        t2u_decoder_input_ids = kwargs_speech.get("decoder_input_ids")
+        t2u_tgt_lang_id = self.generation_config.t2u_lang_code_to_id.get(tgt_lang)
+        t2u_decoder_input_ids = torch.tensor(
+            [[self.config.t2u_eos_token_id, t2u_tgt_lang_id]] * batch_size, device=self.device
+        )
+        kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids
+
+        # second generation
+        unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech)
+        output_unit_ids = unit_ids.detach().clone()
+
+        # get rid of t2u_decoder_input_ids
+        unit_ids = unit_ids[:, kwargs_speech["decoder_input_ids"].shape[1] :]
+        # replace eos per pad
+        unit_ids[unit_ids == self.config.t2u_eos_token_id] = self.config.t2u_pad_token_id
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+        spkr_id = torch.tensor([[spkr_id]] * len(unit_ids), device=self.device)
+
+        waveform, waveform_lengths = self.vocoder(input_ids=unit_ids, spkr_id=spkr_id, lang_id=vocoder_tgt_lang_id)
+
+        if return_intermediate_token_ids:
+            return SeamlessM4TGenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+
+@auto_docstring(
+    custom_intro="""
+    The original SeamlessM4T Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).
+    """
+)
+class SeamlessM4TModel(SeamlessM4TPreTrainedModel, GenerationMixin):
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config, current_modality="text"):
+        r"""
+        current_modality (`str`, *optional*, defaults to `"text"`):
+            Default modality. Used to initialize the model.
+        """
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4TEncoder(config, self.shared)
+        self.speech_encoder = SeamlessM4TSpeechEncoder(config)
+        self.text_decoder = SeamlessM4TDecoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.current_modality = current_modality
+        if current_modality == "speech":
+            self.main_input_name = "input_features"
+
+        # these models already call post_init in their initialization
+        self.t2u_model = SeamlessM4TTextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4TCodeHifiGan(config)
+
+    def set_modality(self, modality="text"):
+        if modality == "text":
+            self.main_input_name = "input_ids"
+            self.current_modality = "text"
+        elif modality == "speech":
+            self.main_input_name = "input_features"
+            self.current_modality = "speech"
+        else:
+            raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.")
+
+    def get_encoder(self):
+        if self.current_modality == "text":
+            return self.text_encoder
+        else:
+            return self.speech_encoder
+
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        if input_ids is None and input_features is None and inputs_embeds is None and encoder_outputs is None:
+            raise ValueError(
+                "`input_ids`,`input_features`, `inputs_embeds` and `encoder_outputs` are all empty. Make sure at least one of them is not."
+            )
+        elif input_features is not None:
+            if input_ids is not None:
+                logger.warning(
+                    "`input_ids` is not `None` but `input_features` has been given."
+                    "`input_features` will be used in priority through the `speech_encoder`. "
+                    "Make sure that `input_features` and `input_ids` are mutually exclusive."
+                )
+
+            if inputs_embeds is not None:
+                logger.warning(
+                    "`inputs_embeds` is not `None` but `input_features` has been given."
+                    "`input_features` will be used in priority through `speech_encoder`. "
+                    "`inputs_embeds` will be ignored."
+                )
+
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This calls the same method `forward` as `SeamlessM4TForTextToText` and `SeamlessM4TForSpeechToText`"
+                "depending on the input modality. If you want to generate speech, use the `generate` method."
+            )
+
+            self.set_modality("speech")
+
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+        elif input_ids is not None or inputs_embeds is not None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This calls the same method `forward` as `SeamlessM4TForTextToText` and `SeamlessM4TForSpeechToText`"
+                "depending on the input modality. If you want to generate speech, use the `generate` method."
+            )
+            self.set_modality("text")
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        # input modality = speech so new attention mask
+        if self.current_modality == "speech" and attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        spkr_id: Optional[int] = 0,
+        generate_speech: Optional[bool] = True,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4TGenerationOutput]:
+        """
+        Generates translated token ids and/or translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_ids=input_ids, num_beams=4, speech_do_sample=True)` will successively
+        perform beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+                Note that if `generate_speech=False`, this parameter will be ignored and
+                the text tokens are returned.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            spkr_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            generate_speech (`bool`, *optional*, defaults to `True`):
+                If `False`, will only returns the text tokens and won't generate speech.
+
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+        Returns:
+            `Union[SeamlessM4TGenerationOutput, tuple[Tensor], ModelOutput]`:
+            - If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4TGenerationOutput`].
+            - If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of
+              shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample.
+            - If `generate_speech=False`, it will returns `ModelOutput`.
+        """
+        if input_ids is None and input_features is None and kwargs.get("inputs_embeds") is None:
+            raise ValueError(
+                "`input_ids`,`input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not."
+            )
+
+        if generate_speech and tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+
+        if tgt_lang is not None:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4T supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        batch_size = (
+            len(input_features)
+            if input_features is not None
+            else (len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")))
+        )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        if tgt_lang is not None:
+            # tgt_lang gets priority over decoder input ids
+            text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+            text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        if input_features is not None:
+            self.set_modality("speech")
+            if input_ids is not None:
+                logger.warning(
+                    "`input_features` and `input_ids` are both non empty. `input_features` will be used in priority "
+                    "through the speech encoder. Make sure `input_features=None` if you want to use the text encoder."
+                )
+            text_generation_output = super().generate(input_features=input_features, **kwargs_text)
+        else:
+            self.set_modality("text")
+            text_generation_output = super().generate(input_ids=input_ids, input_features=None, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        if not generate_speech:
+            return text_generation_output
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        # get encoder last hidden states
+        if self.current_modality == "speech":
+            # get last_hidden_state from encoder - must do a pass through the speech encoder
+            encoder_hidden_states = self.speech_encoder(
+                input_features=input_features, attention_mask=attention_mask
+            ).last_hidden_state
+
+            # input modality = speech so new attention mask for the decoder
+            if attention_mask is not None:
+                sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                    encoder_hidden_states.device
+                )
+                attention_mask = _compute_new_attention_mask(
+                    hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+                )
+        else:
+            encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+        # take care of num_return_sequences
+        # take most probable hidden states per batch of return_sequences
+        # (batch_size*num_return_sequences, ...) -> (batch_size,...)
+        if num_return_sequences > 1:
+            idx_most_probable_sequences_per_batch = text_generation_output.sequences_scores.view(batch_size, -1)
+            idx_most_probable_sequences_per_batch = idx_most_probable_sequences_per_batch.argmax(-1)
+            idx_most_probable_sequences_per_batch = (
+                idx_most_probable_sequences_per_batch
+                + torch.arange(batch_size, device=self.device) * num_return_sequences
+            )
+            sequences = sequences[idx_most_probable_sequences_per_batch]
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # Compute t2u decoder_input_ids
+        t2u_decoder_input_ids = kwargs_speech.get("decoder_input_ids")
+        t2u_tgt_lang_id = self.generation_config.t2u_lang_code_to_id.get(tgt_lang)
+        t2u_decoder_input_ids = torch.tensor(
+            [[self.config.t2u_eos_token_id, t2u_tgt_lang_id]] * batch_size, device=self.device
+        )
+        kwargs_speech["decoder_input_ids"] = t2u_decoder_input_ids
+
+        # second generation
+        unit_ids = self.t2u_model.generate(inputs_embeds=t2u_input_embeds, **kwargs_speech)
+        output_unit_ids = unit_ids.detach().clone()
+
+        # get rid of t2u_decoder_input_ids
+        unit_ids = unit_ids[:, kwargs_speech["decoder_input_ids"].shape[1] :]
+        # replace eos per pad
+        unit_ids[unit_ids == self.config.t2u_eos_token_id] = self.config.t2u_pad_token_id
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+        spkr_id = torch.tensor([[spkr_id]] * len(unit_ids), device=self.device)
+
+        waveform, waveform_lengths = self.vocoder(input_ids=unit_ids, spkr_id=spkr_id, lang_id=vocoder_tgt_lang_id)
+
+        if return_intermediate_token_ids:
+            return SeamlessM4TGenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+
+__all__ = [
+    "SeamlessM4TForTextToSpeech",
+    "SeamlessM4TForSpeechToSpeech",
+    "SeamlessM4TForTextToText",
+    "SeamlessM4TForSpeechToText",
+    "SeamlessM4TModel",
+    "SeamlessM4TPreTrainedModel",
+    "SeamlessM4TCodeHifiGan",
+    "SeamlessM4THifiGan",
+    "SeamlessM4TTextToUnitForConditionalGeneration",
+    "SeamlessM4TTextToUnitModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/processing_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/processing_seamless_m4t.py
new file mode 100644
index 0000000000000000000000000000000000000000..85e1968d7d43c006b5495b5729031abfd5026010
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/processing_seamless_m4t.py
@@ -0,0 +1,109 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Audio/Text processor class for SeamlessM4T
+"""
+
+from typing import Optional, Union
+
+from ...audio_utils import AudioInput
+from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack
+from ...tokenization_utils_base import PreTokenizedInput, TextInput
+from ...utils import logging
+from ...utils.deprecation import deprecate_kwarg
+
+
+logger = logging.get_logger(__name__)
+
+
+class SeamlessM4TTextKwargs(TextKwargs):
+    src_lang: Optional[str]
+    tgt_lang: Optional[str]
+
+
+class SeamlessM4TProcessorKwargs(ProcessingKwargs, total=False):
+    text_kwargs: SeamlessM4TTextKwargs
+    _defaults = {}
+
+
+class SeamlessM4TProcessor(ProcessorMixin):
+    r"""
+    Constructs a SeamlessM4T processor which wraps a SeamlessM4T feature extractor and a SeamlessM4T tokenizer into a
+    single processor.
+
+    [`SeamlessM4TProcessor`] offers all the functionalities of [`SeamlessM4TFeatureExtractor`] and
+    [`SeamlessM4TTokenizerFast`]. See the [`~SeamlessM4TProcessor.__call__`] and [`~SeamlessM4TProcessor.decode`] for
+    more information.
+
+    Args:
+        feature_extractor ([`SeamlessM4TFeatureExtractor`]):
+            The audio processor is a required input.
+        tokenizer ([`SeamlessM4TTokenizerFast`]):
+            The tokenizer is a required input.
+    """
+
+    feature_extractor_class = "SeamlessM4TFeatureExtractor"
+    tokenizer_class = ("SeamlessM4TTokenizer", "SeamlessM4TTokenizerFast")
+    valid_processor_kwargs = SeamlessM4TProcessorKwargs
+
+    def __init__(self, feature_extractor, tokenizer):
+        super().__init__(feature_extractor, tokenizer)
+
+    @deprecate_kwarg("audios", version="v4.59.0", new_name="audio")
+    def __call__(
+        self,
+        text: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        audios: Optional[AudioInput] = None,
+        audio: Optional[AudioInput] = None,
+        **kwargs: Unpack[ProcessingKwargs],
+    ):
+        """
+        Main method to prepare for the model one or several sequences(s) and audio(s). This method forwards the `text`
+        and `kwargs` arguments to SeamlessM4TTokenizerFast's [`~SeamlessM4TTokenizerFast.__call__`] if `text` is not
+        `None` to encode the text. To prepare the audio(s), this method forwards the `audios` and `kwargs` arguments to
+        SeamlessM4TFeatureExtractor's [`~SeamlessM4TFeatureExtractor.__call__`] if `audios` is not `None`. Please refer
+        to the docstring of the above two methods for more information.
+
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            audios (`np.ndarray`, `torch.Tensor`, `list[np.ndarray]`, `list[torch.Tensor]`):
+                The audio or batch of audios to be prepared. Each audio can be NumPy array or PyTorch tensor. In case
+                of a NumPy array/PyTorch tensor, each audio should be of shape (C, T), where C is a number of channels,
+                and T the sample length of the audio.
+        Returns:
+            [`BatchEncoding`]: A [`BatchEncoding`] with the following fields:
+
+            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
+            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
+              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
+              `None`).
+            - **input_features** -- Audio input features to be fed to a model. Returned when `audios` is not `None`.
+        """
+        if text is not None and audios is not None:
+            raise ValueError(
+                "Text and audios are mututally exclusive when passed to `SeamlessM4T`. Specify one or another."
+            )
+        if audio is None and audios is not None:
+            logger.warning(
+                "Passing `audios` as keyword argument is deprecated and will be removed in v4.63, please pass `audio` instead."
+            )
+            audio = audios
+        return super().__call__(text=text, audio=audio, **kwargs)
+
+
+__all__ = ["SeamlessM4TProcessor"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
new file mode 100644
index 0000000000000000000000000000000000000000..fd773316580c1dc3961a954dc2fd3a8e0d4face9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t.py
@@ -0,0 +1,567 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Tokenization classes for SeamlessM4T."""
+
+import os
+from shutil import copyfile
+from typing import Any, Optional, Union
+
+import sentencepiece as spm
+
+from ...convert_slow_tokenizer import import_protobuf
+from ...tokenization_utils import (
+    BatchEncoding,
+    PreTokenizedInput,
+    PreTrainedTokenizer,
+    TextInput,
+)
+from ...tokenization_utils_base import AddedToken
+from ...utils import PaddingStrategy, logging
+from ...utils.import_utils import requires
+
+
+logger = logging.get_logger(__name__)
+
+
+SPIECE_UNDERLINE = "▁"
+
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model"}
+
+
+@requires(backends=("sentencepiece",))
+class SeamlessM4TTokenizer(PreTrainedTokenizer):
+    """
+    Construct a SeamlessM4T tokenizer.
+
+    Adapted from [`RobertaTokenizer`] and [`XLNetTokenizer`]. Based on
+    [SentencePiece](https://github.com/google/sentencepiece).
+
+    The tokenization method is `<language code> <tokens> <eos>` for source language documents, and `<eos> <language
+    code> <tokens> <eos>` for target language documents.
+
+    Examples:
+
+    ```python
+    >>> from transformers import SeamlessM4TTokenizer
+
+    >>> tokenizer = SeamlessM4TTokenizer.from_pretrained(
+    ...     "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra"
+    ... )
+    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
+    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
+    ```
+
+    Args:
+        vocab_file (`str`):
+            Path to the vocabulary file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        tokenizer_file (`str`, *optional*):
+            The path to a tokenizer file to use instead of the vocab file.
+        src_lang (`str`, *optional*, defaults to `"eng"`):
+            The language to use as source language for translation.
+        tgt_lang (`str`, *optional*, defaults to `"fra"`):
+            The language to use as target language for translation.
+        sp_model_kwargs (`dict[str, Any]`, *optional*):
+            Additional keyword arguments to pass to the model initialization.
+        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
+            A tuple or a list of additional special tokens. Can be used to specify the list of languages that will be
+            supported by the tokenizer.
+        add_prefix_space (`bool`, *optional*, defaults to `True`):
+            Whether or not to add an initial space to the input. This allows to treat the leading word just as any
+            other word.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: list[int] = []
+    suffix_tokens: list[int] = []
+
+    def __init__(
+        self,
+        vocab_file,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        tokenizer_file=None,
+        src_lang="eng",
+        tgt_lang="fra",
+        sp_model_kwargs: Optional[dict[str, Any]] = None,
+        additional_special_tokens=None,
+        add_prefix_space=True,
+        **kwargs,
+    ):
+        self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs
+        # Add this unused argument to keep some important Copied from statements
+        self.legacy = False
+        self.vocab_file = vocab_file
+
+        self.sp_model = self.get_spm_processor(kwargs.pop("from_slow", False))
+
+        # Vocab    |    0    |    1    |   2    |    3    |  4   |  5   |  6   |   7  |   8  |  9
+        # -------- | ------- | ------- | ------ | ------- | ---- | ---- | ---- | ---- | ---- | ----
+        # spm  | '<unk>'   | '<s>' | '</s>' | 'an' | 'en' | '_d' | 'er' | 'in' | '_s' | '_a'
+        # fairseq  | '<pad>'   | '<unk>' | '<s>' | '</s>' | 'an' | 'en' | '▁d' | 'er' | 'in' | '▁s'
+
+        # Mimic fairseq token-to-id alignment for the first 4 token
+        self._added_tokens_decoder = {
+            0: AddedToken(pad_token, special=True) if isinstance(pad_token, str) else pad_token,
+            1: AddedToken(unk_token, special=True) if isinstance(unk_token, str) else unk_token,
+            2: AddedToken(bos_token, special=True) if isinstance(bos_token, str) else bos_token,
+            3: AddedToken(eos_token, special=True) if isinstance(eos_token, str) else eos_token,
+        }
+
+        # The first "real" token "an" has position 4 in the original fairseq vocab and position 3 in the spm vocab
+        self.fairseq_offset = 1
+
+        self.sp_model_size = len(self.sp_model)
+
+        self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang
+        self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang
+        self.add_prefix_space = add_prefix_space
+
+        super().__init__(
+            bos_token=bos_token,
+            eos_token=eos_token,
+            unk_token=unk_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            pad_token=pad_token,
+            tokenizer_file=tokenizer_file,
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            additional_special_tokens=additional_special_tokens,
+            sp_model_kwargs=self.sp_model_kwargs,
+            add_prefix_space=add_prefix_space,
+            **kwargs,
+        )
+
+        self.set_src_lang_special_tokens(self._src_lang)
+        self.set_tgt_lang_special_tokens(self._tgt_lang)
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.__getstate__
+    def __getstate__(self):
+        state = self.__dict__.copy()
+        state["sp_model"] = None
+        state["sp_model_proto"] = self.sp_model.serialized_model_proto()
+        return state
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.__setstate__
+    def __setstate__(self, d):
+        self.__dict__ = d
+
+        # for backward compatibility
+        if not hasattr(self, "sp_model_kwargs"):
+            self.sp_model_kwargs = {}
+
+        self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        self.sp_model.LoadFromSerializedProto(self.sp_model_proto)
+
+    @property
+    def vocab_size(self):
+        return len(self.sp_model)
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair_target: Optional[
+            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
+        ] = None,
+        padding: Union[bool, str, PaddingStrategy] = True,
+        pad_to_multiple_of: Optional[int] = 2,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                 index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            src_lang (`str`, *optional*):
+                A string representing the source language. If not specified, the last `src_lang` specified (either
+                during initialization or when calling this tokenizer) will be used.
+            tgt_lang (`str`, *optional*):
+                A string representing the target language. If not specified, the last `tgt_lang` specified (either
+                during initialization or when calling this tokenizer) will be used.
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`PreTrainedTokenizer.__call__`].
+        """
+        if src_lang is not None:
+            self.src_lang = src_lang
+        if tgt_lang is not None:
+            self.tgt_lang = tgt_lang
+
+        output = super().__call__(
+            text=text,
+            text_pair=text_pair,
+            text_target=text_target,
+            text_pair_target=text_pair_target,
+            padding=padding,
+            pad_to_multiple_of=pad_to_multiple_of,
+            **kwargs,
+        )
+
+        return BatchEncoding(output, tensor_type=kwargs.get("return_tensors"))
+
+    @property
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang
+    def src_lang(self) -> str:
+        return self._src_lang
+
+    @src_lang.setter
+    def src_lang(self, new_src_lang: str) -> None:
+        if "__" not in new_src_lang:
+            self._src_lang = f"__{new_src_lang}__"
+        else:
+            self._src_lang = new_src_lang
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    @property
+    def tgt_lang(self) -> str:
+        return self._tgt_lang
+
+    @tgt_lang.setter
+    def tgt_lang(self, new_tgt_lang: str) -> None:
+        if "__" not in new_tgt_lang:
+            self._tgt_lang = f"__{new_tgt_lang}__"
+        else:
+            self._tgt_lang = new_tgt_lang
+        self.set_tgt_lang_special_tokens(self._tgt_lang)
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.get_special_tokens_mask
+    def get_special_tokens_mask(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None, already_has_special_tokens: bool = False
+    ) -> list[int]:
+        """
+        Retrieve sequence ids from a token list that has no special tokens added. This method is called when adding
+        special tokens using the tokenizer `prepare_for_model` method.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+            already_has_special_tokens (`bool`, *optional*, defaults to `False`):
+                Whether or not the token list is already formatted with special tokens for the model.
+
+        Returns:
+            `list[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
+        """
+
+        if already_has_special_tokens:
+            return super().get_special_tokens_mask(
+                token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
+            )
+
+        prefix_ones = [1] * len(self.prefix_tokens)
+        suffix_ones = [1] * len(self.suffix_tokens)
+        if token_ids_1 is None:
+            return prefix_ones + ([0] * len(token_ids_0)) + suffix_ones
+        return prefix_ones + ([0] * len(token_ids_0)) + ([0] * len(token_ids_1)) + suffix_ones
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.build_inputs_with_special_tokens
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. An NLLB sequence has the following format, where `X` represents the sequence:
+
+        - `input_ids` (for encoder) `X [eos, src_lang_code]`
+        - `decoder_input_ids`: (for decoder) `X [eos, tgt_lang_code]`
+
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.create_token_type_ids_from_sequences
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    def _build_translation_inputs(
+        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
+    ):
+        """Used by translation pipeline, to prepare inputs for the generate function"""
+        if src_lang is None or tgt_lang is None:
+            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model.")
+        self.src_lang = src_lang
+        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
+        if "__" not in tgt_lang:
+            tgt_lang = f"__{tgt_lang}__"
+        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
+        inputs["forced_bos_token_id"] = tgt_lang_id
+        return inputs
+
+    def get_vocab(self):
+        vocab = {
+            self.convert_ids_to_tokens(i): i for i in range(self.fairseq_offset, self.vocab_size + self.fairseq_offset)
+        }
+        vocab.update(self.added_tokens_encoder)
+        return vocab
+
+    @property
+    def unk_token_length(self):
+        return len(self.sp_model.encode(str(self.unk_token)))
+
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.get_spm_processor
+    def get_spm_processor(self, from_slow=False):
+        tokenizer = spm.SentencePieceProcessor(**self.sp_model_kwargs)
+        if self.legacy or from_slow:  # no dependency on protobuf
+            tokenizer.Load(self.vocab_file)
+            return tokenizer
+
+        with open(self.vocab_file, "rb") as f:
+            sp_model = f.read()
+            model_pb2 = import_protobuf(f"The new behaviour of {self.__class__.__name__} (with `self.legacy = False`)")
+            model = model_pb2.ModelProto.FromString(sp_model)
+            normalizer_spec = model_pb2.NormalizerSpec()
+            normalizer_spec.add_dummy_prefix = False
+            model.normalizer_spec.MergeFrom(normalizer_spec)
+            sp_model = model.SerializeToString()
+            tokenizer.LoadFromSerializedProto(sp_model)
+        return tokenizer
+
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer.tokenize
+    def tokenize(self, text: "TextInput", **kwargs) -> list[str]:
+        """
+        Converts a string to a list of tokens. If `self.legacy` is set to `False`, a prefix token is added unless the
+        first token is special.
+        """
+        if self.legacy or len(text) == 0:
+            return super().tokenize(text, **kwargs)
+
+        text = text.replace(SPIECE_UNDERLINE, " ")
+        if self.add_prefix_space:
+            text = SPIECE_UNDERLINE + text
+
+        tokens = super().tokenize(text, **kwargs)
+
+        if len(tokens) > 1 and tokens[0] == SPIECE_UNDERLINE and tokens[1] in self.all_special_tokens:
+            tokens = tokens[1:]
+        return tokens
+
+    # Copied from transformers.models.t5.tokenization_t5.T5Tokenizer._tokenize
+    def _tokenize(self, text, **kwargs):
+        """
+        Returns a tokenized string.
+
+        We de-activated the `add_dummy_prefix` option, thus the sentencepiece internals will always strip any
+        SPIECE_UNDERLINE. For example: `self.sp_model.encode(f"{SPIECE_UNDERLINE}Hey", out_type = str)` will give
+        `['H', 'e', 'y']` instead of `['▁He', 'y']`. Thus we always encode `f"{unk_token}text"` and strip the
+        `unk_token`. Here is an example with `unk_token = "<unk>"` and `unk_token_length = 4`.
+        `self.tokenizer.sp_model.encode("<unk> Hey", out_type = str)[4:]`.
+        """
+        if self.legacy or not text.startswith((SPIECE_UNDERLINE, " ")):
+            return self.sp_model.encode(text, out_type=str)
+
+        # 1. Encode string + prefix ex: "<unk> Hey"
+        tokens = self.sp_model.encode(self.unk_token + text, out_type=str)
+        # 2. Remove self.unk_token from ['<','unk','>', '▁Hey']
+        return tokens[self.unk_token_length :] if len(tokens) >= self.unk_token_length else tokens
+
+    def _convert_token_to_id(self, token):
+        """Converts a token (str) in an id using the vocab."""
+        spm_id = self.sp_model.PieceToId(token)
+
+        # Need to return unknown token if the SP model returned 0
+        return spm_id + self.fairseq_offset if spm_id else self.unk_token_id
+
+    def _convert_id_to_token(self, index):
+        """Converts an index (integer) in a token (str) using the vocab."""
+        return self.sp_model.IdToPiece(index - self.fairseq_offset)
+
+    def convert_tokens_to_string(self, tokens):
+        """Converts a sequence of tokens (strings for sub-words) in a single string."""
+        # since we manually add the prefix space, we have to remove it when decoding
+        if tokens[0].startswith(SPIECE_UNDERLINE) and self.add_prefix_space:
+            tokens[0] = tokens[0][1:]
+
+        out_string = "".join(tokens).replace(SPIECE_UNDERLINE, " ").strip()
+        return out_string
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+        elif not os.path.isfile(self.vocab_file):
+            with open(out_vocab_file, "wb") as fi:
+                content_spiece_model = self.sp_model.serialized_model_proto()
+                fi.write(content_spiece_model)
+
+        return (out_vocab_file,)
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.prepare_seq2seq_batch with eng_Latn->eng, fra_Latn->fra
+    def prepare_seq2seq_batch(
+        self,
+        src_texts: list[str],
+        src_lang: str = "eng",
+        tgt_texts: Optional[list[str]] = None,
+        tgt_lang: str = "fra",
+        **kwargs,
+    ) -> BatchEncoding:
+        self.src_lang = src_lang
+        self.tgt_lang = tgt_lang
+        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_input_mode
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer._switch_to_target_mode
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
+
+    def set_src_lang_special_tokens(self, src_lang) -> None:
+        """Reset the special tokens to the source lang setting.
+        Prefix=[src_lang_code], suffix = [eos]
+        """
+        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
+        self.init_kwargs["src_lang"] = src_lang
+
+        if self.cur_lang_code == self.unk_token_id:
+            logger.warning_once(
+                f"`src_lang={src_lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
+            )
+
+        self.prefix_tokens = [self.cur_lang_code]
+        self.suffix_tokens = [self.eos_token_id]
+
+    # https://github.com/facebookresearch/fairseq2/blob/c53f18e6be6b8b46b722f2249b8397b7eccd7ad3/src/fairseq2/models/nllb/tokenizer.py#L112-L116
+    def set_tgt_lang_special_tokens(self, lang: str) -> None:
+        """Reset the special tokens to the target lang setting.
+        Prefix=[eos, tgt_lang_code] and suffix=[eos].
+        """
+        self.cur_lang_code = self.convert_tokens_to_ids(lang)
+        self.init_kwargs["tgt_lang"] = lang
+
+        if self.cur_lang_code == self.unk_token_id:
+            logger.warning_once(
+                f"`tgt_lang={lang}` has not be found in the vocabulary. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
+            )
+
+        self.prefix_tokens = [self.eos_token_id, self.cur_lang_code]
+        self.suffix_tokens = [self.eos_token_id]
+
+
+__all__ = ["SeamlessM4TTokenizer"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..0318336332c3c9e4ff9dfad1a050cc7b0a3fe47d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t/tokenization_seamless_m4t_fast.py
@@ -0,0 +1,446 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Fast Tokenization class for SeamlessM4T."""
+
+import os
+from shutil import copyfile
+from typing import Optional, Union
+
+from tokenizers import processors
+
+from ...tokenization_utils import (
+    BatchEncoding,
+    PreTokenizedInput,
+    TextInput,
+)
+from ...tokenization_utils_fast import PreTrainedTokenizerFast
+from ...utils import PaddingStrategy, is_sentencepiece_available, logging
+
+
+if is_sentencepiece_available():
+    from .tokenization_seamless_m4t import SeamlessM4TTokenizer
+else:
+    SeamlessM4TTokenizer = None
+
+logger = logging.get_logger(__name__)
+
+VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "tokenizer_file": "tokenizer.json"}
+
+
+class SeamlessM4TTokenizerFast(PreTrainedTokenizerFast):
+    """
+    Construct a "fast" SeamlessM4T tokenizer (backed by HuggingFace's *tokenizers* library). Based on
+    [BPE](https://huggingface.co/docs/tokenizers/python/latest/components.html?highlight=BPE#models).
+
+    This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
+    refer to this superclass for more information regarding those methods.
+
+    The tokenization method is `<language code> <tokens> <eos>` for source language documents, and `<eos> <language
+    code> <tokens> <eos>` for target language documents.
+
+    Examples:
+
+    ```python
+    >>> from transformers import SeamlessM4TTokenizerFast
+
+    >>> tokenizer = SeamlessM4TTokenizerFast.from_pretrained(
+    ...     "facebook/hf-seamless-m4t-medium", src_lang="eng", tgt_lang="fra"
+    ... )
+    >>> example_english_phrase = " UN Chief Says There Is No Military Solution in Syria"
+    >>> expected_translation_french = "Le chef de l'ONU affirme qu'il n'y a pas de solution militaire en Syrie."
+    >>> inputs = tokenizer(example_english_phrase, text_target=expected_translation_french, return_tensors="pt")
+    ```
+
+    Args:
+        vocab_file (`str`, *optional*):
+            Path to the vocabulary file.
+        tokenizer_file (`str`, *optional*):
+            The path to a tokenizer file to use instead of the vocab file.
+        bos_token (`str`, *optional*, defaults to `"<s>"`):
+            The beginning of sequence token that was used during pretraining. Can be used a sequence classifier token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the beginning of
+            sequence. The token used is the `cls_token`.
+
+            </Tip>
+
+        eos_token (`str`, *optional*, defaults to `"</s>"`):
+            The end of sequence token.
+
+            <Tip>
+
+            When building a sequence using special tokens, this is not the token that is used for the end of sequence.
+            The token used is the `sep_token`.
+
+            </Tip>
+
+        sep_token (`str`, *optional*, defaults to `"</s>"`):
+            The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
+            sequence classification or for a text and a question for question answering. It is also used as the last
+            token of a sequence built with special tokens.
+        cls_token (`str`, *optional*, defaults to `"<s>"`):
+            The classifier token which is used when doing sequence classification (classification of the whole sequence
+            instead of per-token classification). It is the first token of the sequence when built with special tokens.
+        unk_token (`str`, *optional*, defaults to `"<unk>"`):
+            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
+            token instead.
+        pad_token (`str`, *optional*, defaults to `"<pad>"`):
+            The token used for padding, for example when batching sequences of different lengths.
+        src_lang (`str`, *optional*, defaults to `"eng"`):
+            The language to use as source language for translation.
+        tgt_lang (`str`, *optional*, defaults to `"fra"`):
+            The language to use as target language for translation.
+        additional_special_tokens (tuple or list of `str` or `tokenizers.AddedToken`, *optional*):
+            A tuple or a list of additional special tokens.
+    """
+
+    vocab_files_names = VOCAB_FILES_NAMES
+    slow_tokenizer_class = SeamlessM4TTokenizer
+    model_input_names = ["input_ids", "attention_mask"]
+
+    prefix_tokens: list[int] = []
+    suffix_tokens: list[int] = []
+
+    def __init__(
+        self,
+        vocab_file=None,
+        tokenizer_file=None,
+        bos_token="<s>",
+        eos_token="</s>",
+        sep_token="</s>",
+        cls_token="<s>",
+        unk_token="<unk>",
+        pad_token="<pad>",
+        src_lang="eng",
+        tgt_lang="fra",
+        additional_special_tokens=None,
+        **kwargs,
+    ):
+        super().__init__(
+            vocab_file=vocab_file,
+            tokenizer_file=tokenizer_file,
+            bos_token=bos_token,
+            eos_token=eos_token,
+            sep_token=sep_token,
+            cls_token=cls_token,
+            unk_token=unk_token,
+            pad_token=pad_token,
+            src_lang=src_lang,
+            tgt_lang=tgt_lang,
+            additional_special_tokens=additional_special_tokens,
+            **kwargs,
+        )
+
+        self.vocab_file = vocab_file
+        self._src_lang = f"__{src_lang}__" if "__" not in src_lang else src_lang
+        self._tgt_lang = f"__{tgt_lang}__" if "__" not in tgt_lang else tgt_lang
+        self.set_src_lang_special_tokens(self._src_lang)
+        self.set_tgt_lang_special_tokens(self._tgt_lang)
+
+    @property
+    # Copied from transformers.models.nllb.tokenization_nllb.NllbTokenizer.src_lang
+    def src_lang(self) -> str:
+        return self._src_lang
+
+    @src_lang.setter
+    def src_lang(self, new_src_lang: str) -> None:
+        if "__" not in new_src_lang:
+            self._src_lang = f"__{new_src_lang}__"
+        else:
+            self._src_lang = new_src_lang
+        self.set_src_lang_special_tokens(self._src_lang)
+
+    @property
+    def tgt_lang(self) -> str:
+        return self._tgt_lang
+
+    @tgt_lang.setter
+    def tgt_lang(self, new_tgt_lang: str) -> None:
+        if "__" not in new_tgt_lang:
+            self._tgt_lang = f"__{new_tgt_lang}__"
+        else:
+            self._tgt_lang = new_tgt_lang
+        self.set_tgt_lang_special_tokens(self._tgt_lang)
+
+    def build_inputs_with_special_tokens(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
+        adding special tokens. The special tokens depend on calling set_lang.
+
+        An SeamlessM4T sequence has the following format, where `X` represents the sequence:
+
+        - `input_ids` (for encoder) `[src_lang_code] X [eos]`
+        - `decoder_input_ids`: (for decoder) `[eos, tgt_lang_code] X [eos]`
+
+        BOS is never used. Pairs of sequences are not the expected use case, but they will be handled without a
+        separator.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs to which the special tokens will be added.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: list of [input IDs](../glossary#input-ids) with the appropriate special tokens.
+        """
+        if token_ids_1 is None:
+            return self.prefix_tokens + token_ids_0 + self.suffix_tokens
+        # We don't expect to process pairs, but leave the pair logic for API consistency
+        return self.prefix_tokens + token_ids_0 + token_ids_1 + self.suffix_tokens
+
+    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.create_token_type_ids_from_sequences
+    def create_token_type_ids_from_sequences(
+        self, token_ids_0: list[int], token_ids_1: Optional[list[int]] = None
+    ) -> list[int]:
+        """
+        Create a mask from the two sequences passed to be used in a sequence-pair classification task. nllb does not
+        make use of token type ids, therefore a list of zeros is returned.
+
+        Args:
+            token_ids_0 (`list[int]`):
+                List of IDs.
+            token_ids_1 (`list[int]`, *optional*):
+                Optional second list of IDs for sequence pairs.
+
+        Returns:
+            `list[int]`: List of zeros.
+
+        """
+
+        sep = [self.sep_token_id]
+        cls = [self.cls_token_id]
+
+        if token_ids_1 is None:
+            return len(cls + token_ids_0 + sep) * [0]
+        return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]
+
+    def _build_translation_inputs(
+        self, raw_inputs, return_tensors: str, src_lang: Optional[str], tgt_lang: Optional[str], **extra_kwargs
+    ):
+        """Used by translation pipeline, to prepare inputs for the generate function"""
+        if src_lang is None or tgt_lang is None:
+            raise ValueError("Translation requires a `src_lang` and a `tgt_lang` for this model")
+        self.src_lang = src_lang
+        inputs = self(raw_inputs, add_special_tokens=True, return_tensors=return_tensors, **extra_kwargs)
+        if "__" not in tgt_lang:
+            tgt_lang = f"__{tgt_lang}__"
+        tgt_lang_id = self.convert_tokens_to_ids(tgt_lang)
+        inputs["forced_bos_token_id"] = tgt_lang_id
+        return inputs
+
+    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.prepare_seq2seq_batch with "fra_Latn"->"fra", "eng_Latn"->"eng"
+    def prepare_seq2seq_batch(
+        self,
+        src_texts: list[str],
+        src_lang: str = "eng",
+        tgt_texts: Optional[list[str]] = None,
+        tgt_lang: str = "fra",
+        **kwargs,
+    ) -> BatchEncoding:
+        self.src_lang = src_lang
+        self.tgt_lang = tgt_lang
+        return super().prepare_seq2seq_batch(src_texts, tgt_texts, **kwargs)
+
+    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_input_mode
+    def _switch_to_input_mode(self):
+        return self.set_src_lang_special_tokens(self.src_lang)
+
+    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast._switch_to_target_mode
+    def _switch_to_target_mode(self):
+        return self.set_tgt_lang_special_tokens(self.tgt_lang)
+
+    def set_src_lang_special_tokens(self, src_lang) -> None:
+        """Reset the special tokens to the source lang setting.
+        Prefix=[src_lang_code], suffix = [eos]
+        """
+        self.cur_lang_code = self.convert_tokens_to_ids(src_lang)
+
+        if self.cur_lang_code == self.unk_token_id:
+            logger.warning_once(
+                f"`tgt_lang={src_lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
+            )
+
+        self.init_kwargs["src_lang"] = src_lang
+
+        self.prefix_tokens = [self.cur_lang_code]
+        self.suffix_tokens = [self.eos_token_id]
+
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
+    def set_tgt_lang_special_tokens(self, lang: str) -> None:
+        """Reset the special tokens to the target lang setting.
+        Prefix=[eos, tgt_lang_code] and suffix=[eos].
+        """
+        self.cur_lang_code = self.convert_tokens_to_ids(lang)
+
+        if self.cur_lang_code == self.unk_token_id:
+            logger.warning_once(
+                f"`tgt_lang={lang}` has not be found in the `vocabulary`. Behaviour will probably be unexpected because the language token id will be replaced by the unknown token id."
+            )
+
+        self.init_kwargs["tgt_lang"] = lang
+
+        self.prefix_tokens = [self.eos_token_id, self.cur_lang_code]
+        self.suffix_tokens = [self.eos_token_id]
+
+        prefix_tokens_str = self.convert_ids_to_tokens(self.prefix_tokens)
+        suffix_tokens_str = self.convert_ids_to_tokens(self.suffix_tokens)
+
+        self._tokenizer.post_processor = processors.TemplateProcessing(
+            single=prefix_tokens_str + ["$A"] + suffix_tokens_str,
+            pair=prefix_tokens_str + ["$A", "$B"] + suffix_tokens_str,
+            special_tokens=list(zip(prefix_tokens_str + suffix_tokens_str, self.prefix_tokens + self.suffix_tokens)),
+        )
+
+    # Copied from transformers.models.nllb.tokenization_nllb_fast.NllbTokenizerFast.save_vocabulary
+    def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> tuple[str]:
+        if not self.can_save_slow_tokenizer:
+            raise ValueError(
+                "Your fast tokenizer does not have the necessary information to save the vocabulary for a slow "
+                "tokenizer."
+            )
+
+        if not os.path.isdir(save_directory):
+            logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
+            return
+        out_vocab_file = os.path.join(
+            save_directory, (filename_prefix + "-" if filename_prefix else "") + VOCAB_FILES_NAMES["vocab_file"]
+        )
+
+        if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
+            copyfile(self.vocab_file, out_vocab_file)
+
+        return (out_vocab_file,)
+
+    @classmethod
+    def _from_pretrained(
+        cls,
+        resolved_vocab_files,
+        pretrained_model_name_or_path,
+        init_configuration,
+        *init_inputs,
+        token=None,
+        cache_dir=None,
+        local_files_only=False,
+        _commit_hash=None,
+        _is_local=False,
+        **kwargs,
+    ):
+        tokenizer = super()._from_pretrained(
+            resolved_vocab_files,
+            pretrained_model_name_or_path,
+            init_configuration,
+            *init_inputs,
+            token=token,
+            cache_dir=cache_dir,
+            local_files_only=local_files_only,
+            _commit_hash=_commit_hash,
+            _is_local=_is_local,
+            **kwargs,
+        )
+
+        # ensure also set after from pretrained
+        tokenizer.set_src_lang_special_tokens(tokenizer._src_lang)
+        tokenizer.set_tgt_lang_special_tokens(tokenizer._tgt_lang)
+
+        return tokenizer
+
+    def __call__(
+        self,
+        text: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair: Optional[Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]] = None,
+        text_target: Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]] = None,
+        text_pair_target: Optional[
+            Union[TextInput, PreTokenizedInput, list[TextInput], list[PreTokenizedInput]]
+        ] = None,
+        padding: Union[bool, str, PaddingStrategy] = True,
+        pad_to_multiple_of: Optional[int] = 2,
+        src_lang: Optional[str] = None,
+        tgt_lang: Optional[str] = None,
+        **kwargs,
+    ):
+        """
+        Args:
+            text (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
+                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
+                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            text_pair_target (`str`, `list[str]`, `list[list[str]]`, *optional*):
+                The sequence or batch of sequences to be encoded as target texts. Each sequence can be a string or a
+                list of strings (pretokenized string). If the sequences are provided as list of strings (pretokenized),
+                you must set `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
+            padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `True`):
+                 Select a strategy to pad the returned sequences (according to the model's padding side and padding
+                 index) among:
+
+                - `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
+                  sequence if provided).
+                - `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
+                  acceptable input length for the model if that argument is not provided.
+                - `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
+                  lengths).
+            pad_to_multiple_of (`int`, *optional*):
+                If set will pad the sequence to a multiple of the provided value.
+
+                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
+                `>= 7.5` (Volta).
+            src_lang (`str`, *optional*):
+                A string representing the source language. If not specified, the last `src_lang` specified (either
+                during initialization or when calling this tokenizer) will be used.
+            tgt_lang (`str`, *optional*):
+                A string representing the target language. If not specified, the last `tgt_lang` specified (either
+                during initialization or when calling this tokenizer) will be used.
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`PreTrainedTokenizerFast.__call__`].
+        """
+        if src_lang is not None:
+            self.src_lang = src_lang
+        if tgt_lang is not None:
+            self.tgt_lang = tgt_lang
+
+        output = super().__call__(
+            text=text,
+            text_pair=text_pair,
+            text_target=text_target,
+            text_pair_target=text_pair_target,
+            padding=padding,
+            pad_to_multiple_of=pad_to_multiple_of,
+            **kwargs,
+        )
+
+        return output
+
+
+__all__ = ["SeamlessM4TTokenizerFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..10f256f8575595d6b3409c030e1d6c3b33c75aa9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_seamless_m4t_v2 import *
+    from .modeling_seamless_m4t_v2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..7ac6c4024a5a6dd3dc5f60472974c771bf27e27e
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/configuration_seamless_m4t_v2.py
@@ -0,0 +1,425 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""SeamlessM4Tv2 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class SeamlessM4Tv2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`~SeamlessM4Tv2Model`]. It is used to instantiate
+    an SeamlessM4Tv2 model according to the specified arguments, defining the model architecture. Instantiating a
+    configuration with the defaults will yield a similar configuration to that of the SeamlessM4Tv2
+    [""](https://huggingface.co/"") architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 256102):
+            Vocabulary size of the text modality of the SeamlessM4Tv2 model. Defines the number of different tokens
+            that can be represented by the `inputs_ids` passed when calling [`~SeamlessM4Tv2Model`],
+            [`~SeamlessM4Tv2ForTextToSpeech`] or [`~SeamlessM4Tv2ForTextToText`].
+        t2u_vocab_size (`int`, *optional*, defaults to 10082):
+            Unit vocabulary size of the SeamlessM4Tv2 model. Defines the number of different "unit tokens" that can be
+            represented by the `inputs_ids` passed when calling the Text-To-Units sub-model of [`~SeamlessM4Tv2Model`],
+            [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`].
+        char_vocab_size (`int`, *optional*, defaults to 10943):
+            Character vocabulary size of the SeamlessM4Tv2 model. Defines the number of different character tokens that
+            can be represented by the `char_inputs_ids` passed when calling the Text-To-Units sub-model of
+            [`~SeamlessM4Tv2Model`], [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`].
+
+        > Parameters shared across sub-models
+
+        hidden_size (`int`, *optional*, defaults to 1024):
+            Dimensionality of the "intermediate" layers in the architecture.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the layer normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models).
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model text encoder and decoder might ever be used with. Typically set
+            this to something large just in case (e.g., 512 or 1024 or 2048).
+        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
+            Whether the model is used as an encoder/decoder or not.
+        encoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the encoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        decoder_layerdrop (`float`, *optional*, defaults to 0.05):
+            The LayerDrop probability for the decoders. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556)
+            for more details.
+        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
+            The non-linear activation function (function or string) in the decoder and feed-forward layers. If string,
+            `"gelu"`, `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, decoder, and pooler.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all attention layers.
+        activation_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all activation layers in the model.
+        scale_embedding (`bool`, *optional*, defaults to `True`):
+            Scale embeddings by diving by sqrt(d_model).
+
+        > Text encoder and text decoder specific parameters
+
+        encoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer text encoder.
+        encoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text encoder.
+        encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text encoder.
+        decoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer text decoder.
+        decoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text decoder.
+        decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text decoder.
+        decoder_start_token_id (`int`, *optional*, defaults to 3):
+            If an encoder-decoder model starts decoding with a different token than _bos_, the id of that token. Only
+            applied in the text decoder.
+        max_new_tokens (`int`, *optional*, defaults to 256):
+            The maximum numbers of text tokens to generate, ignoring the number of tokens in the prompt.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the _padding_ text token. Only applied to the text-decoder model.
+        bos_token_id (`int`, *optional*, defaults to 2):
+            The id of the _beginning-of-stream_ text token. Only applied to the text-decoder model.
+        eos_token_id (`int`, *optional*, defaults to 3):
+            The id of the _end-of-stream_ text token. Only applied to the text-decoder model.
+
+        > Speech encoder specific parameters
+
+        speech_encoder_layers (`int`, *optional*, defaults to 24):
+            Number of hidden layers in the Transformer speech encoder.
+        speech_encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer speech encoder.
+        speech_encoder_intermediate_size (`int`, *optional*, defaults to 4096):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer speech encoder.
+        speech_encoder_hidden_act (`str` or `function`, *optional*, defaults to `"swish"`):
+            The non-linear activation function (function or string) in the speech encoder. If string, `"gelu"`,
+            `"relu"`, `"selu"`, `"swish"` and `"gelu_new"` are supported.
+        speech_encoder_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for all layers in the speech encoder.
+        add_adapter (`bool`, *optional*, defaults to `True`):
+            Add an adapter layer on top of the speech encoder.
+        speech_encoder_layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability for the speech encoder. See the [LayerDrop paper](see
+            https://huggingface.co/papers/1909.11556) for more details.
+        feature_projection_input_dim (`int`, *optional*, defaults to 160):
+            Input dimension of the input feature projection of the speech encoder, i.e the dimension after processing
+            input audios with [`SeamlessM4TFeatureExtractor`].
+        adaptor_kernel_size (`int`, *optional*, defaults to 8):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adaptor_stride (`int`, *optional*, defaults to 8):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adaptor_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all layers in the speech adapter.
+        num_adapter_layers (`int`, *optional*, defaults to 1):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        position_embeddings_type (`str`, *optional*, defaults to `"relative_key"`):
+            Can be specified to `relative_key`. If left to `None`, no relative position embedding is applied. Only
+            applied to the speech encoder. For more information on `"relative_key"`, please refer to [Self-Attention
+            with Relative Position Representations (Shaw et al.)](https://huggingface.co/papers/1803.02155).
+        conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
+            Kernel size of convolutional depthwise 1D layer in Conformer blocks. Only applied to the speech encoder.
+        left_max_position_embeddings (`int`, *optional*, defaults to 64):
+            The left clipping value for relative positions.
+        right_max_position_embeddings (`int`, *optional*, defaults to 8):
+            The right clipping value for relative positions.
+        speech_encoder_chunk_size (`int`, *optional*, defaults to 20000): The size of each attention chunk.
+        speech_encoder_left_chunk_num (`int`, *optional*, defaults to 128):
+            Number of chunks on the left up to which lookahead is allowed.
+
+        > Text-To-Unit (t2u) model specific parameters
+
+        t2u_bos_token_id (`int`, *optional*, defaults to 0):
+            The id of the _beginning-of-stream_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_pad_token_id (`int`, *optional*, defaults to 1):
+            The id of the _padding_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the _end-of-stream_ unit token. Only applied to the text-to-unit seq2seq model.
+        t2u_encoder_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer text-to-unit encoder.
+        t2u_encoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit encoder.
+        t2u_encoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text-to-unit encoder.
+        t2u_decoder_layers (`int`, *optional*, defaults to 6):
+            Number of hidden layers in the Transformer text-to-unit decoder.
+        t2u_decoder_ffn_dim (`int`, *optional*, defaults to 8192):
+            Dimension of the "intermediate" (i.e., feed-forward) layer in the Transformer text-to-unit decoder.
+        t2u_decoder_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer text-to-unit decoder.
+        t2u_max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model text-to-unit component might ever be used with. Typically set
+            this to something large just in case (e.g., 512 or 1024 or 2048).
+        t2u_variance_predictor_embed_dim (`int`, *optional*, defaults to 1024):
+            The projection dimension of the text-to-unit's duration predictor.
+        t2u_variance_predictor_hidden_dim (`int`, *optional*, defaults to 256):
+            Internal dimension of the text-to-unit's duration predictor.
+        t2u_variance_predictor_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers of the text-to-unit's duration predictor.
+        t2u_variance_pred_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout probability of the text-to-unit's duration predictor.
+
+         > Hifi-Gan Vocoder specific parameters
+
+        sampling_rate (`int`, *optional*, defaults to 16000):
+            The sampling rate at which the output audio will be generated, expressed in hertz (Hz).
+        upsample_initial_channel (`int`, *optional*, defaults to 512):
+            The number of input channels into the hifi-gan upsampling network. Applies to the vocoder only.
+        upsample_rates (`tuple[int]` or `list[int]`, *optional*, defaults to `[5, 4, 4, 2, 2]`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the vocoder upsampling network.
+            The length of *upsample_rates* defines the number of convolutional layers and has to match the length of
+            *upsample_kernel_sizes*. Applies to the vocoder only.
+        upsample_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[11, 8, 8, 4, 4]`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the vocoder upsampling
+            network. The length of *upsample_kernel_sizes* defines the number of convolutional layers and has to match
+            the length of *upsample_rates*. Applies to the vocoder only.
+        resblock_kernel_sizes (`tuple[int]` or `list[int]`, *optional*, defaults to `[3, 7, 11]`):
+            A tuple of integers defining the kernel sizes of the vocoder 1D convolutional layers in the multi-receptive
+            field fusion (MRF) module. Applies to the vocoder only.
+        resblock_dilation_sizes (`tuple[tuple[int]]` or `list[list[int]]`, *optional*, defaults to `[[1, 3, 5], [1, 3, 5], [1, 3, 5]]`):
+            A nested tuple of integers defining the dilation rates of the vocoder dilated 1D convolutional layers in
+            the multi-receptive field fusion (MRF) module. Applies to the vocoder only.
+        leaky_relu_slope (`float`, *optional*, defaults to 0.1):
+            The angle of the negative slope used by the leaky ReLU activation in the vocoder. Applies to the vocoder
+            only.
+        unit_hifi_gan_vocab_size (`int`, *optional*, defaults to 10000):
+            Vocabulary size of the SeamlessM4Tv2 vocoder. Defines the number of different unit tokens that can be
+            represented by the `inputs_ids` passed when calling the vocoder of [`~SeamlessM4Tv2Model`],
+            [`~SeamlessM4Tv2ForSpeechToSpeech`] or [`~SeamlessM4Tv2ForTextToSpeech`].
+        unit_embed_dim (`int`, *optional*, defaults to 1280):
+            The projection dimension of the input ids given to the hifi-gan vocoder. Applies to the vocoder only.
+        lang_embed_dim (`int`, *optional*, defaults to 256):
+            The projection dimension of the target language given to the hifi-gan vocoder. Applies to the vocoder only.
+        spkr_embed_dim (`int`, *optional*, defaults to 256):
+            The projection dimension of the speaker id given to the hifi-gan vocoder. Applies to the vocoder only.
+        vocoder_num_langs (`int`, *optional*, defaults to 36):
+            Number of langs supported by the vocoder. Might be different from `t2u_num_langs`.
+        vocoder_num_spkrs (`int`, *optional*, defaults to 200):
+            Number of speakers supported by the vocoder.
+        variance_predictor_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the duration predictor. Applies to the vocoder only.
+        var_pred_dropout (`float`, *optional*, defaults to 0.5):
+            The dropout probability of the duration predictor. Applies to the vocoder only.
+        vocoder_offset (`int`, *optional*, defaults to 4):
+            Offset the unit token ids by this number to account for symbol tokens. Applies to the vocoder only.
+
+    ```python
+    >>> from transformers import SeamlessM4Tv2Model, SeamlessM4Tv2Config
+
+    >>> # Initializing a SeamlessM4Tv2 "" style configuration
+    >>> configuration = SeamlessM4Tv2Config()
+
+    >>> # Initializing a model from the "" style configuration
+    >>> model = SeamlessM4Tv2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "seamless_m4t_v2"
+
+    def __init__(
+        self,
+        vocab_size=256102,
+        t2u_vocab_size=10082,
+        char_vocab_size=10943,
+        # shared config
+        hidden_size=1024,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        use_cache=True,
+        max_position_embeddings=4096,
+        is_encoder_decoder=True,
+        encoder_layerdrop=0.05,
+        decoder_layerdrop=0.05,
+        activation_function="relu",
+        dropout=0.1,
+        attention_dropout=0.1,
+        activation_dropout=0.0,
+        scale_embedding=True,
+        # text encoder|decoder
+        encoder_layers=24,
+        encoder_ffn_dim=8192,
+        encoder_attention_heads=16,
+        decoder_layers=24,
+        decoder_ffn_dim=8192,
+        decoder_attention_heads=16,
+        decoder_start_token_id=3,
+        max_new_tokens=256,
+        pad_token_id=0,
+        bos_token_id=2,
+        eos_token_id=3,
+        # speech_encoder
+        speech_encoder_layers=24,
+        speech_encoder_attention_heads=16,
+        speech_encoder_intermediate_size=4096,
+        speech_encoder_hidden_act="swish",
+        speech_encoder_dropout=0.0,
+        add_adapter=True,
+        speech_encoder_layerdrop=0.1,
+        feature_projection_input_dim=160,
+        adaptor_kernel_size=8,
+        adaptor_stride=8,
+        adaptor_dropout=0.1,
+        num_adapter_layers=1,
+        position_embeddings_type="relative_key",
+        conv_depthwise_kernel_size=31,
+        left_max_position_embeddings=64,
+        right_max_position_embeddings=8,
+        speech_encoder_chunk_size=20000,
+        speech_encoder_left_chunk_num=128,
+        # t2u config
+        t2u_bos_token_id=0,
+        t2u_pad_token_id=1,
+        t2u_eos_token_id=2,
+        t2u_encoder_layers=6,
+        t2u_encoder_ffn_dim=8192,
+        t2u_encoder_attention_heads=16,
+        t2u_decoder_layers=6,
+        t2u_decoder_ffn_dim=8192,
+        t2u_decoder_attention_heads=16,
+        t2u_max_position_embeddings=4096,
+        t2u_variance_predictor_embed_dim=1024,
+        t2u_variance_predictor_hidden_dim=256,
+        t2u_variance_predictor_kernel_size=3,
+        t2u_variance_pred_dropout=0.5,
+        # hifi-gan vocoder config
+        sampling_rate=16000,
+        upsample_initial_channel=512,
+        upsample_rates=[5, 4, 4, 2, 2],
+        upsample_kernel_sizes=[11, 8, 8, 4, 4],
+        resblock_kernel_sizes=[3, 7, 11],
+        resblock_dilation_sizes=[[1, 3, 5], [1, 3, 5], [1, 3, 5]],
+        leaky_relu_slope=0.1,
+        # specific to Code Hifi-Gan
+        unit_hifi_gan_vocab_size=10000,
+        unit_embed_dim=1280,
+        lang_embed_dim=256,
+        spkr_embed_dim=256,
+        vocoder_num_langs=36,
+        vocoder_num_spkrs=200,
+        variance_predictor_kernel_size=3,
+        var_pred_dropout=0.5,
+        vocoder_offset=4,
+        **kwargs,
+    ):
+        # overall_config
+        self.vocab_size = vocab_size
+        self.t2u_vocab_size = t2u_vocab_size
+        self.char_vocab_size = char_vocab_size
+        self.hidden_size = hidden_size
+        self.initializer_range = initializer_range
+        self.layer_norm_eps = layer_norm_eps
+        self.max_position_embeddings = max_position_embeddings
+        self.use_cache = use_cache
+        self.max_new_tokens = max_new_tokens
+        self.encoder_layerdrop = encoder_layerdrop
+        self.decoder_layerdrop = decoder_layerdrop
+        self.activation_function = activation_function
+        self.dropout = dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.scale_embedding = scale_embedding
+        # for proper config init
+        self.num_attention_heads = decoder_attention_heads
+        self.num_hidden_layers = decoder_layers
+
+        # text|unit encoder|decoder
+        self.encoder_layers = encoder_layers
+        self.encoder_ffn_dim = encoder_ffn_dim
+        self.encoder_attention_heads = encoder_attention_heads
+        self.decoder_layers = decoder_layers
+        self.decoder_ffn_dim = decoder_ffn_dim
+        self.decoder_attention_heads = decoder_attention_heads
+
+        # speech_encoder
+        self.speech_encoder_layers = speech_encoder_layers
+        self.speech_encoder_hidden_act = speech_encoder_hidden_act
+        self.speech_encoder_dropout = speech_encoder_dropout
+        self.speech_encoder_attention_heads = speech_encoder_attention_heads
+        self.speech_encoder_layerdrop = speech_encoder_layerdrop
+        self.speech_encoder_intermediate_size = speech_encoder_intermediate_size
+        self.feature_projection_input_dim = feature_projection_input_dim
+        self.adaptor_kernel_size = adaptor_kernel_size
+        self.adaptor_stride = adaptor_stride
+        self.adaptor_dropout = adaptor_dropout
+        self.num_adapter_layers = num_adapter_layers
+        self.position_embeddings_type = position_embeddings_type
+        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
+        self.add_adapter = add_adapter
+        self.left_max_position_embeddings = left_max_position_embeddings
+        self.right_max_position_embeddings = right_max_position_embeddings
+        self.speech_encoder_chunk_size = speech_encoder_chunk_size
+        self.speech_encoder_left_chunk_num = speech_encoder_left_chunk_num
+
+        # t2u config
+        self.t2u_bos_token_id = t2u_bos_token_id
+        self.t2u_pad_token_id = t2u_pad_token_id
+        self.t2u_eos_token_id = t2u_eos_token_id
+        self.t2u_encoder_layers = t2u_encoder_layers
+        self.t2u_encoder_ffn_dim = t2u_encoder_ffn_dim
+        self.t2u_encoder_attention_heads = t2u_encoder_attention_heads
+        self.t2u_decoder_layers = t2u_decoder_layers
+        self.t2u_decoder_ffn_dim = t2u_decoder_ffn_dim
+        self.t2u_decoder_attention_heads = t2u_decoder_attention_heads
+        self.t2u_max_position_embeddings = t2u_max_position_embeddings
+        self.t2u_variance_predictor_embed_dim = t2u_variance_predictor_embed_dim  # TODO: add to docstrings
+        self.t2u_variance_predictor_hidden_dim = t2u_variance_predictor_hidden_dim  # TODO: add to docstrings
+        self.t2u_variance_predictor_kernel_size = t2u_variance_predictor_kernel_size  # TODO: add to docstrings
+        self.t2u_variance_pred_dropout = t2u_variance_pred_dropout  # TODO: add to docstrings
+
+        # hifi-gan vocoder config
+        # original parameters specific to Hifi-Gan
+        self.sampling_rate = sampling_rate
+        self.upsample_initial_channel = upsample_initial_channel
+        self.upsample_rates = upsample_rates
+        self.upsample_kernel_sizes = upsample_kernel_sizes
+        self.resblock_kernel_sizes = resblock_kernel_sizes
+        self.resblock_dilation_sizes = resblock_dilation_sizes
+        self.leaky_relu_slope = leaky_relu_slope
+
+        # specific to Code Hifi-Gan
+        self.unit_hifi_gan_vocab_size = unit_hifi_gan_vocab_size
+        self.unit_embed_dim = unit_embed_dim
+        self.lang_embed_dim = lang_embed_dim
+        self.spkr_embed_dim = spkr_embed_dim
+        self.vocoder_num_langs = vocoder_num_langs
+        self.vocoder_num_spkrs = vocoder_num_spkrs
+        self.variance_predictor_kernel_size = variance_predictor_kernel_size
+        self.var_pred_dropout = var_pred_dropout
+        self.vocoder_offset = vocoder_offset
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            decoder_start_token_id=decoder_start_token_id,
+            is_encoder_decoder=is_encoder_decoder,
+            max_position_embeddings=max_position_embeddings,
+            **kwargs,
+        )
+
+
+__all__ = ["SeamlessM4Tv2Config"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
new file mode 100644
index 0000000000000000000000000000000000000000..bf883374b68f23e195327b153d2a3855b43820b1
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seamless_m4t_v2/modeling_seamless_m4t_v2.py
@@ -0,0 +1,4398 @@
+# coding=utf-8
+# Copyright 2023 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch SeamlessM4Tv2 model."""
+
+import copy
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import Tensor, nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache, EncoderDecoderCache
+from ...generation import GenerationMixin
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_attn_mask_utils import _prepare_4d_attention_mask, _prepare_4d_causal_attention_mask
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    BaseModelOutputWithPastAndCrossAttentions,
+    Seq2SeqLMOutput,
+    Seq2SeqModelOutput,
+    Wav2Vec2BaseModelOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_seamless_m4t_v2 import SeamlessM4Tv2Config
+
+
+logger = logging.get_logger(__name__)
+
+SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS = r"""
+    input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+        Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+        [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+    decoder_input_ids (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+        Indices of decoder input sequence tokens in the vocabulary.
+
+        Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+        [`PreTrainedTokenizer.__call__`] for details.
+
+        [What are decoder input IDs?](../glossary#decoder-input-ids)
+
+        Bart uses the `eos_token_id` as the starting token for `decoder_input_ids` generation. If `past_key_values`
+        is used, optionally only the last `decoder_input_ids` have to be input (see `past_key_values`).
+
+        For translation and summarization training, `decoder_input_ids` should be provided. If no
+        `decoder_input_ids` is provided, the model will create this tensor by shifting the `input_ids` to the right
+        for denoising pre-training following the paper.
+    decoder_attention_mask (`torch.LongTensor` of shape `(batch_size, target_sequence_length)`, *optional*):
+        Default behavior: generate a tensor that ignores pad tokens in `decoder_input_ids`. Causal mask will also
+        be used by default.
+
+        If you want to change padding behavior, you should read [`modeling_bart._prepare_decoder_attention_mask`]
+        and modify to your needs. See diagram 1 in [the paper](https://huggingface.co/papers/1910.13461) for more
+        information on the default strategy.
+    inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*):
+        Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+        is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+        model's internal embedding lookup matrix.
+"""
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class defining the generated outputs from [`SeamlessM4Tv2Model`], [`SeamlessM4Tv2ForTextToText`],
+    [`SeamlessM4Tv2ForTextToSpeech`], [`SeamlessM4Tv2ForSpeechToSpeech`] and [`SeamlessM4Tv2ForTextToSpeech`].
+    """
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TGenerationOutput with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2GenerationOutput(ModelOutput):
+    r"""
+    waveform (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+        The final audio waveform predicted by the model.
+    waveform_lengths (`torch.IntTensor` of shape `(batch_size,)`, *optional*):
+        The length in samples of each element in the `waveform` batch.
+    sequences (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+        The generated translated sequences. This is the output of the text-to-text or the speech-to-text models.
+        The second dimension (sequence_length) is either equal to `max_length` or shorter if all batches finished
+        early due to the `eos_token_id`.
+    unit_sequences (`torch.LongTensor` of shape `(batch_size, unit_sequence_length)`, *optional*):
+        The generated translated unit sequences. This is the output of the text-to-units model. The second
+        dimension (unit_sequence_length) is either equal to `t2u_max_length` or shorter if all batches finished
+        early due to the `t2u_eos_token_id`.
+    """
+
+    waveform: Optional[torch.FloatTensor] = None
+    waveform_lengths: Optional[torch.IntTensor] = None
+    sequences: Optional[tuple[torch.FloatTensor]] = None
+    unit_sequences: Optional[tuple[torch.FloatTensor]] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class defining the outputs from [`SeamlessM4Tv2TextToUnitDecoder`].
+    """
+)
+class SeamlessM4Tv2TextToUnitDecoderOutput(ModelOutput):
+    r"""
+    padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+        for *masked*
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    padding_mask: Optional[torch.Tensor] = None
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Class defining the outputs from [`SeamlessM4Tv2TextToUnitForConditionalGeneration`] and
+        [`SeamlessM4Tv2TextToUnitModel`].
+    """
+)
+class SeamlessM4Tv2TextToUnitOutput(ModelOutput):
+    r"""
+    last_hidden_state (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`):
+        Sequence of hidden-states at the output of the last layer of the decoder of the model.
+
+        If `past_key_values` is used only the last hidden-state of the sequences of shape `(batch_size, 1,
+        hidden_size)` is output.
+    padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+        Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked* or 0
+        for *masked*
+    loss (`torch.FloatTensor` of shape `(1,)`, *optional*, returned when `labels` is provided):
+        Language modeling loss.
+    """
+
+    last_hidden_state: Optional[torch.FloatTensor] = None
+    padding_mask: Optional[torch.Tensor] = None
+    decoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    decoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    encoder_last_hidden_state: Optional[torch.FloatTensor] = None
+    encoder_hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    encoder_attentions: Optional[tuple[torch.FloatTensor]] = None
+    loss: Optional[torch.FloatTensor] = None
+
+
+############ UTILS ################
+
+
+# Copied from transformers.models.roberta.modeling_roberta.create_position_ids_from_input_ids
+def create_position_ids_from_input_ids(input_ids, padding_idx, past_key_values_length=0):
+    """
+    Replace non-padding symbols with their position numbers. Position numbers begin at padding_idx+1. Padding symbols
+    are ignored. This is modified from fairseq's `utils.make_positions`.
+
+    Args:
+        x: torch.Tensor x:
+
+    Returns: torch.Tensor
+    """
+    # The series of casts and type-conversions here are carefully balanced to both work with ONNX export and XLA.
+    mask = input_ids.ne(padding_idx).int()
+    incremental_indices = (torch.cumsum(mask, dim=1).type_as(mask) + past_key_values_length) * mask
+    return incremental_indices.long() + padding_idx
+
+
+# Copied from transformers.models.bart.modeling_bart.shift_tokens_right
+def shift_tokens_right(input_ids: torch.Tensor, pad_token_id: int, decoder_start_token_id: int):
+    """
+    Shift input ids one token to the right.
+    """
+    shifted_input_ids = input_ids.new_zeros(input_ids.shape)
+    shifted_input_ids[:, 1:] = input_ids[:, :-1].clone()
+    shifted_input_ids[:, 0] = decoder_start_token_id
+
+    if pad_token_id is None:
+        raise ValueError("self.model.config.pad_token_id has to be defined.")
+    # replace possible -100 values in labels by `pad_token_id`
+    shifted_input_ids.masked_fill_(shifted_input_ids == -100, pad_token_id)
+
+    return shifted_input_ids
+
+
+def _compute_new_attention_mask(hidden_states: torch.Tensor, seq_lens: torch.Tensor):
+    """
+    Computes an attention mask of the form `(batch, seq_len)` with an attention for each element in the batch that
+    stops at the corresponding element in `seq_lens`.
+
+    Args:
+        hidden_states (`torch.FloatTensor` of shape `(batch, seq_len, *)`):
+            The sequences to mask, where `*` is any number of sequence-specific dimensions including none.
+        seq_lens (`torch.Tensor` of shape `(batch)`:
+            Each element represents the length of the sequence at the same index in `hidden_states`
+
+    Returns:
+        `torch.FloatTensor`: The float attention mask of shape `(batch, seq_len)`
+    """
+    batch_size, mask_seq_len = hidden_states.shape[:2]
+
+    indices = torch.arange(mask_seq_len, device=seq_lens.device).expand(batch_size, -1)
+
+    bool_mask = indices >= seq_lens.unsqueeze(1).expand(-1, mask_seq_len)
+
+    mask = hidden_states.new_ones((batch_size, mask_seq_len))
+
+    mask = mask.masked_fill(bool_mask, 0)
+
+    return mask
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.format_speech_generation_kwargs with SeamlessM4T->SeamlessM4Tv2
+def format_speech_generation_kwargs(kwargs):
+    """
+    Format kwargs for SeamlessM4Tv2 models that generate speech, attribute kwargs to either the text generation or the
+    speech generation models.
+
+    Args:
+        kwargs (`dict`)`:
+             Keyword arguments are of two types:
+
+                - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                except for `decoder_input_ids` which will only be passed through the text components.
+                - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                This means you can, for example, specify a generation strategy for one generation but not for the
+                other.
+    """
+    # attribute kwargs to models
+    kwargs_text = {}
+    kwargs_speech = {}
+    for key, value in kwargs.items():
+        if key.startswith("text_"):
+            key = key[len("text_") :]
+            kwargs_text[key] = value
+        elif key.startswith("speech_"):
+            key = key[len("speech_") :]
+            kwargs_speech[key] = value
+        elif key == "generation_config":
+            kwargs_text[key] = value
+        else:
+            # If the key is already in a specific config, then it's been set with a
+            # submodules specific value and we don't override
+            if key not in kwargs_text:
+                kwargs_text[key] = value
+            if key not in kwargs_speech:
+                kwargs_speech[key] = value
+    return kwargs_text, kwargs_speech
+
+
+############ SPEECH ENCODER related code ################
+
+
+class SeamlessM4Tv2ConformerFeatureProjection(nn.Module):
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeatureProjection.__init__
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.feature_projection_input_dim, eps=config.layer_norm_eps)
+        self.projection = nn.Linear(config.feature_projection_input_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+    def forward(self, hidden_states):
+        # non-projected hidden states are needed for quantization
+        norm_hidden_states = self.layer_norm(hidden_states.to(self.layer_norm.weight.dtype))
+        hidden_states = self.projection(norm_hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerFeedForward with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerFeedForward(nn.Module):
+    def __init__(self, config, act_fn=None, dropout=None):
+        super().__init__()
+        dropout = dropout if dropout is not None else config.speech_encoder_dropout
+        act_fn = act_fn if act_fn is not None else config.speech_encoder_hidden_act
+
+        self.intermediate_dropout = nn.Dropout(dropout)
+        self.intermediate_dense = nn.Linear(config.hidden_size, config.speech_encoder_intermediate_size)
+        self.intermediate_act_fn = ACT2FN[act_fn] if isinstance(act_fn, str) else act_fn
+
+        self.output_dense = nn.Linear(config.speech_encoder_intermediate_size, config.hidden_size)
+        self.output_dropout = nn.Dropout(dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states)
+
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states)
+        return hidden_states
+
+
+class SeamlessM4Tv2ConformerConvolutionModule(nn.Module):
+    """Convolution block used in the conformer block. Uses a causal depthwise convolution similar to that
+    described in Section 2.1 of https://huggingface.co/papers/1609.03499
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+        self.pointwise_conv1 = nn.Conv1d(
+            config.hidden_size,
+            2 * config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.glu = nn.GLU(dim=1)
+        self.depthwise_conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding=0,
+            groups=config.hidden_size,
+            bias=False,
+        )
+        self.depthwise_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.activation = ACT2FN[config.speech_encoder_hidden_act]
+        self.pointwise_conv2 = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+
+    def forward(self, hidden_states, attention_mask=None):
+        hidden_states = self.layer_norm(hidden_states)
+
+        # Ensure that we do not leak padded positions in depthwise convolution.
+        # Put 0 where necessary
+        if attention_mask is not None:
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism
+        # => (batch, 2*channel, dim)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # => (batch, channel, dim)
+        hidden_states = self.glu(hidden_states)
+
+        # Pad the sequence entirely on the left because of causal convolution.
+        hidden_states = torch.nn.functional.pad(hidden_states, (self.depthwise_conv.kernel_size[0] - 1, 0))
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.depthwise_layer_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class SeamlessM4Tv2ConformerSelfAttention(nn.Module):
+    """Construct a SeamlessM4Tv2ConformerSelfAttention object.
+    Can be enhanced with relative position embeddings.
+    """
+
+    def __init__(self, config, use_position_embeddings=True):
+        super().__init__()
+
+        self.head_size = config.hidden_size // config.speech_encoder_attention_heads
+        self.num_heads = config.speech_encoder_attention_heads
+        self.position_embeddings_type = config.position_embeddings_type if use_position_embeddings else None
+
+        self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.dropout = nn.Dropout(p=config.speech_encoder_dropout)
+
+        if self.position_embeddings_type == "relative_key":
+            self.left_max_position_embeddings = config.left_max_position_embeddings
+            self.right_max_position_embeddings = config.right_max_position_embeddings
+            num_positions = self.left_max_position_embeddings + self.right_max_position_embeddings + 1
+            self.distance_embedding = nn.Embedding(num_positions, self.head_size)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # self-attention mechanism
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        attn_weights = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        if self.position_embeddings_type == "relative_key":
+            query_length, key_length = query.shape[2], key.shape[2]
+
+            position_ids_l = torch.arange(query_length, dtype=torch.long, device=hidden_states.device).view(-1, 1)
+            position_ids_r = torch.arange(key_length, dtype=torch.long, device=hidden_states.device).view(1, -1)
+            distance = position_ids_r - position_ids_l
+            distance = torch.clamp(distance, -self.left_max_position_embeddings, self.right_max_position_embeddings)
+
+            positional_embedding = self.distance_embedding(distance + self.left_max_position_embeddings)
+            positional_embedding = positional_embedding.to(dtype=query.dtype)  # fp16 compatibility
+
+            relative_position_attn_weights = torch.einsum("bhld,lrd->bhlr", query, positional_embedding)
+            attn_weights = attn_weights + (relative_position_attn_weights / math.sqrt(self.head_size))
+
+        # apply attention_mask if necessary
+        if attention_mask is not None:
+            attn_weights = attn_weights + attention_mask
+
+        # => (batch, head, time1, time2)
+        attn_weights = torch.softmax(attn_weights, dim=-1)
+        attn_weights = self.dropout(attn_weights)
+
+        # => (batch, head, time1, d_k)
+        attn_output = torch.matmul(attn_weights, value)
+
+        # => (batch, time1, hidden_size)
+        attn_output = attn_output.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        attn_output = self.linear_out(attn_output)
+
+        if not output_attentions:
+            attn_weights = None
+
+        return attn_output, attn_weights
+
+
+class SeamlessM4Tv2ConformerEncoderLayer(GradientCheckpointingLayer):
+    """Conformer block based on https://huggingface.co/papers/2005.08100."""
+
+    # Copied from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer.Wav2Vec2ConformerEncoderLayer.__init__ with Wav2Vec2->SeamlessM4Tv2, attention_dropout->speech_encoder_dropout, torch.nn->nn
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.speech_encoder_dropout
+
+        # Feed-forward 1
+        self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn1 = SeamlessM4Tv2ConformerFeedForward(config)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_dropout = nn.Dropout(dropout)
+        self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config)
+
+        # Conformer Convolution
+        self.conv_module = SeamlessM4Tv2ConformerConvolutionModule(config)
+
+        # Feed-forward 2
+        self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn2 = SeamlessM4Tv2ConformerFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        conv_attention_mask: Optional[torch.Tensor] = None,
+    ):
+        # 1. Feed-Forward 1 layer
+        residual = hidden_states
+        hidden_states = self.ffn1_layer_norm(hidden_states)
+        hidden_states = self.ffn1(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        residual = hidden_states
+
+        # 2. Self-Attention layer
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # 3. Convolutional Layer
+        residual = hidden_states
+        hidden_states = self.conv_module(hidden_states, attention_mask=conv_attention_mask)
+        hidden_states = residual + hidden_states
+
+        # 4. Feed-Forward 2 Layer
+        residual = hidden_states
+        hidden_states = self.ffn2_layer_norm(hidden_states)
+        hidden_states = self.ffn2(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return hidden_states, attn_weights
+
+
+class SeamlessM4Tv2ConformerEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        self.dropout = nn.Dropout(config.speech_encoder_dropout)
+        self.layers = nn.ModuleList(
+            [SeamlessM4Tv2ConformerEncoderLayer(config) for _ in range(config.speech_encoder_layers)]
+        )
+
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+
+        self.gradient_checkpointing = False
+
+    def _apply_chunk_attention(self, attention_mask, hidden_states):
+        """
+        Creates a chunk attention mask. It creates a mask to prevent attention across chunks, ensuring that each
+        position attends only to positions within its own chunk. If a left chunk overlap is specified
+        (`speech_encoder_chunk_size` in the configuration), the attention mask is adjusted accordingly to allow each
+        position to also attends the `speech_encoder_chunk_size - 1` previous chunks.
+        """
+        sequence_len = hidden_states.shape[1]
+
+        chunk_indices = torch.arange(sequence_len, device=hidden_states.device)
+        chunk_indices = torch.div(chunk_indices, self.config.speech_encoder_chunk_size).long()
+
+        start_indices = torch.full_like(chunk_indices, 0)
+        if self.config.speech_encoder_left_chunk_num >= 0:
+            start_indices = (chunk_indices - self.config.speech_encoder_left_chunk_num).clamp_(min=0)
+            start_indices = start_indices * self.config.speech_encoder_chunk_size
+        start_indices = start_indices.unsqueeze(1).expand(-1, sequence_len)
+
+        end_indices = ((chunk_indices + 1) * self.config.speech_encoder_chunk_size).clamp_(max=sequence_len)
+
+        end_indices = end_indices.unsqueeze(1).expand(-1, sequence_len)
+
+        indices = torch.arange(sequence_len, device=hidden_states.device).unsqueeze(0).expand(sequence_len, -1)
+
+        chunk_mask = (indices < start_indices) | (indices >= end_indices)
+        chunk_mask = chunk_mask.unsqueeze(0).unsqueeze(0)
+
+        attention_mask = chunk_mask if attention_mask is None else (attention_mask.bool() | chunk_mask)
+        attention_mask = attention_mask.to(dtype=hidden_states.dtype)
+        return attention_mask
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        conv_attention_mask = attention_mask
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            hidden_states = hidden_states.masked_fill(~attention_mask.bool().unsqueeze(-1), 0.0)
+            # extend attention_mask
+            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+            attention_mask = attention_mask.expand(
+                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+            )
+
+        if self.config.speech_encoder_chunk_size is not None:
+            attention_mask = self._apply_chunk_attention(attention_mask, hidden_states)
+
+        if attention_mask is not None:
+            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+
+        hidden_states = self.dropout(hidden_states)
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.config.speech_encoder_layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    output_attentions=output_attentions,
+                    conv_attention_mask=conv_attention_mask,
+                )
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapterLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerAdapterLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.adaptor_dropout
+
+        self.kernel_size = config.adaptor_kernel_size
+        self.stride = config.adaptor_stride
+
+        # 1. residual convolution
+        self.residual_layer_norm = nn.LayerNorm(embed_dim)
+        self.residual_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.activation = nn.GLU(dim=1)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_conv = nn.Conv1d(
+            embed_dim,
+            2 * embed_dim,
+            self.kernel_size,
+            stride=self.stride,
+            padding=self.stride // 2,
+        )
+        self.self_attn = SeamlessM4Tv2ConformerSelfAttention(config, use_position_embeddings=False)
+        self.self_attn_dropout = nn.Dropout(dropout)
+
+        # Feed-forward
+        self.ffn_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=dropout)
+
+    def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+        pad = self.kernel_size // 2
+        seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+        seq_lens = ((seq_lens + 2 * pad - self.kernel_size) / self.stride) + 1
+
+        return seq_lens.floor()
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        residual = self.residual_layer_norm(hidden_states)
+
+        # Apply pooling to the residual to match the sequence length of the
+        # multi-head attention output.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        residual = residual.transpose(1, 2)
+        residual = self.residual_conv(residual)
+        residual = self.activation(residual)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        residual = residual.transpose(1, 2)
+
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        # Apply pooling before feeding to the multihead-attention layer.
+        # (batch, seq_len, feature_dim) -> (batch, feature_dim, seq_len)
+        hidden_states = hidden_states.transpose(1, 2)
+        hidden_states = self.self_attn_conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        # (batch, feature_dim, seq_len) -> (batch, seq_len, feature_dim)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                hidden_states.device
+            )
+            attention_mask = _compute_new_attention_mask(hidden_states=hidden_states, seq_lens=sub_sampled_lengths)
+            attention_mask = _prepare_4d_attention_mask(
+                attention_mask,
+                hidden_states.dtype,
+            )
+
+        # The rest of the computation is identical to a vanilla Transformer
+        # encoder layer.
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+        hidden_states = self.ffn(hidden_states) + residual
+
+        return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TConformerAdapter with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2ConformerAdapter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        self.layers = nn.ModuleList(
+            SeamlessM4Tv2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers)
+        )
+
+    def forward(self, hidden_states, attention_mask):
+        # down project hidden_states if necessary
+
+        for layer in self.layers:
+            hidden_states = layer(hidden_states, attention_mask)
+
+        return hidden_states
+
+
+############ TEXT / UNITS related code ################
+
+
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100ScaledWordEmbedding with M2M100->SeamlessM4Tv2
+class SeamlessM4Tv2ScaledWordEmbedding(nn.Embedding):
+    """
+    This module overrides nn.Embeddings' forward by multiplying with embeddings scale.
+    """
+
+    def __init__(self, num_embeddings: int, embedding_dim: int, padding_idx: int, embed_scale: Optional[float] = 1.0):
+        super().__init__(num_embeddings, embedding_dim, padding_idx)
+        self.embed_scale = embed_scale
+
+    def forward(self, input_ids: torch.Tensor):
+        return super().forward(input_ids) * self.embed_scale
+
+
+# Copied from transformers.models.m2m_100.modeling_m2m_100.M2M100SinusoidalPositionalEmbedding
+class SeamlessM4Tv2SinusoidalPositionalEmbedding(nn.Module):
+    """This module produces sinusoidal positional embeddings of any length."""
+
+    def __init__(self, num_positions: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        super().__init__()
+        self.offset = 2
+        self.embedding_dim = embedding_dim
+        self.padding_idx = padding_idx
+        self.make_weights(num_positions + self.offset, embedding_dim, padding_idx)
+
+    def make_weights(self, num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        emb_weights = self.get_embedding(num_embeddings, embedding_dim, padding_idx)
+        if hasattr(self, "weights"):
+            # in forward put the weights on the correct dtype and device of the param
+            emb_weights = emb_weights.to(dtype=self.weights.dtype, device=self.weights.device)
+
+        self.register_buffer("weights", emb_weights, persistent=False)
+
+    @staticmethod
+    def get_embedding(num_embeddings: int, embedding_dim: int, padding_idx: Optional[int] = None):
+        """
+        Build sinusoidal embeddings.
+
+        This matches the implementation in tensor2tensor, but differs slightly from the description in Section 3.5 of
+        "Attention Is All You Need".
+        """
+        half_dim = embedding_dim // 2
+        emb = math.log(10000) / (half_dim - 1)
+        emb = torch.exp(torch.arange(half_dim, dtype=torch.int64).float() * -emb)
+        emb = torch.arange(num_embeddings, dtype=torch.int64).float().unsqueeze(1) * emb.unsqueeze(0)
+        emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1).view(num_embeddings, -1)
+        if embedding_dim % 2 == 1:
+            # zero pad
+            emb = torch.cat([emb, torch.zeros(num_embeddings, 1)], dim=1)
+        if padding_idx is not None:
+            emb[padding_idx, :] = 0
+
+        return emb.to(torch.get_default_dtype())
+
+    @torch.no_grad()
+    def forward(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.Tensor] = None,
+        past_key_values_length: int = 0,
+    ):
+        if input_ids is not None:
+            bsz, seq_len = input_ids.size()
+            # Create the position ids from the input token ids. Any padded tokens remain padded.
+            position_ids = create_position_ids_from_input_ids(input_ids, self.padding_idx, past_key_values_length).to(
+                input_ids.device
+            )
+        else:
+            bsz, seq_len = inputs_embeds.size()[:-1]
+            position_ids = self.create_position_ids_from_inputs_embeds(inputs_embeds, past_key_values_length)
+
+        # expand embeddings if needed
+        max_pos = self.padding_idx + 1 + seq_len + past_key_values_length
+        if max_pos > self.weights.size(0):
+            self.make_weights(max_pos + self.offset, self.embedding_dim, self.padding_idx)
+
+        return self.weights.index_select(0, position_ids.view(-1)).view(bsz, seq_len, self.weights.shape[-1]).detach()
+
+    def create_position_ids_from_inputs_embeds(self, inputs_embeds, past_key_values_length):
+        """
+        We are provided embeddings directly. We cannot infer which are padded so just generate sequential position ids.
+
+        Args:
+            inputs_embeds: torch.Tensor
+
+        Returns: torch.Tensor
+        """
+        input_shape = inputs_embeds.size()[:-1]
+        sequence_length = input_shape[1]
+
+        position_ids = torch.arange(
+            self.padding_idx + 1, sequence_length + self.padding_idx + 1, dtype=torch.long, device=inputs_embeds.device
+        )
+        return position_ids.unsqueeze(0).expand(input_shape).contiguous() + past_key_values_length
+
+
+class SeamlessM4Tv2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    # Copied from transformers.models.bart.modeling_bart.BartAttention.__init__ with Bart->SeamlessM4Tv2
+    def __init__(
+        self,
+        embed_dim: int,
+        num_heads: int,
+        dropout: float = 0.0,
+        is_decoder: bool = False,
+        bias: bool = True,
+        is_causal: bool = False,
+        config: Optional[SeamlessM4Tv2Config] = None,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        self.embed_dim = embed_dim
+        self.num_heads = num_heads
+        self.dropout = dropout
+        self.head_dim = embed_dim // num_heads
+        self.config = config
+
+        if (self.head_dim * num_heads) != self.embed_dim:
+            raise ValueError(
+                f"embed_dim must be divisible by num_heads (got `embed_dim`: {self.embed_dim}"
+                f" and `num_heads`: {num_heads})."
+            )
+        self.scaling = self.head_dim**-0.5
+        self.is_decoder = is_decoder
+        self.is_causal = is_causal
+        self.layer_idx = layer_idx
+        if layer_idx is None and self.is_decoder:
+            logger.warning_once(
+                f"Instantiating a decoder {self.__class__.__name__} without passing `layer_idx` is not recommended and "
+                "will lead to errors during the forward call, if caching is used. Please make sure to provide a `layer_idx` "
+                "when creating this class."
+            )
+
+        self.k_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.v_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.q_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+        self.out_proj = nn.Linear(embed_dim, embed_dim, bias=bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        """Input shape: Batch x Time x Channel"""
+
+        is_cross_attention = encoder_hidden_states is not None
+        batch_size, seq_length = hidden_states.shape[:2]
+
+        is_updated = False
+        if past_key_values is not None:
+            if isinstance(past_key_values, EncoderDecoderCache):
+                is_updated = past_key_values.is_updated.get(self.layer_idx)
+                if is_cross_attention:
+                    # after the first generated id, we can subsequently re-use all key/value_states from cache
+                    curr_past_key_value = past_key_values.cross_attention_cache
+                else:
+                    curr_past_key_value = past_key_values.self_attention_cache
+            else:
+                curr_past_key_value = past_key_values
+
+        current_states = encoder_hidden_states if is_cross_attention else hidden_states
+        if is_cross_attention and past_key_values is not None and is_updated:
+            # reuse k,v, cross_attentions
+            key_states = curr_past_key_value.layers[self.layer_idx].keys
+            value_states = curr_past_key_value.layers[self.layer_idx].values
+        else:
+            key_states = self.k_proj(current_states)
+            value_states = self.v_proj(current_states)
+            key_states = key_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+            value_states = value_states.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
+
+            if past_key_values is not None:
+                # save all key/value_states to cache to be re-used for fast auto-regressive generation
+                cache_position = cache_position if not is_cross_attention else None
+                key_states, value_states = curr_past_key_value.update(
+                    key_states, value_states, self.layer_idx, {"cache_position": cache_position}
+                )
+                # set flag that curr layer for cross-attn is already updated so we can re-use in subsequent calls
+                if is_cross_attention and isinstance(past_key_values, EncoderDecoderCache):
+                    past_key_values.is_updated[self.layer_idx] = True
+
+        query_states = self.q_proj(hidden_states)
+        query_states = query_states.reshape(batch_size, seq_length, self.num_heads, self.head_dim).transpose(1, 2)
+        query_states = query_states * self.scaling
+        attention_scores = torch.matmul(query_states, key_states.transpose(-1, -2))
+
+        if attention_mask is not None:
+            attention_scores = attention_scores + attention_mask
+
+        # (batch_size, n_heads, seq_length, key_length)
+        attn_weights = nn.functional.softmax(attention_scores.float(), dim=-1).type_as(attention_scores)
+        attn_weights = nn.functional.dropout(attn_weights, p=self.dropout, training=self.training)
+
+        #  attn_output = torch.bmm(attn_probs, value_states) ?
+        context_states = torch.matmul(attn_weights, value_states)
+        # attn_output = attn_output.view(bsz, self.num_heads, tgt_len, self.head_dim) ?
+        context_states = context_states.permute(0, 2, 1, 3).contiguous().view(batch_size, seq_length, -1)
+        attn_output = self.out_proj(context_states)
+
+        return attn_output, attn_weights
+
+
+# Copied from transformers.models.nllb_moe.modeling_nllb_moe.NllbMoeDenseActDense with NllbMoe->SeamlessM4Tv2,DenseActDense->FeedForwardNetwork, d_model->hidden_size
+class SeamlessM4Tv2FeedForwardNetwork(nn.Module):
+    def __init__(self, config: SeamlessM4Tv2Config, ffn_dim: int):
+        super().__init__()
+        self.fc1 = nn.Linear(config.hidden_size, ffn_dim)
+        self.fc2 = nn.Linear(ffn_dim, config.hidden_size)
+        self.dropout = nn.Dropout(config.activation_dropout)
+        self.act = ACT2FN[config.activation_function]
+
+    def forward(self, hidden_states):
+        hidden_states = self.fc1(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        if (
+            isinstance(self.fc2.weight, torch.Tensor)
+            and hidden_states.dtype != self.fc2.weight.dtype
+            and (self.fc2.weight.dtype != torch.int8 and self.fc2.weight.dtype != torch.uint8)
+        ):
+            hidden_states = hidden_states.to(self.fc2.weight.dtype)
+        hidden_states = self.fc2(hidden_states)
+        return hidden_states
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoderLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2EncoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: SeamlessM4Tv2Config, encoder_ffn_dim=None, encoder_attention_heads=None):
+        super().__init__()
+        encoder_ffn_dim = config.encoder_ffn_dim if encoder_ffn_dim is None else encoder_ffn_dim
+        encoder_attention_heads = (
+            config.encoder_attention_heads if encoder_attention_heads is None else encoder_attention_heads
+        )
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = SeamlessM4Tv2Attention(
+            embed_dim=self.embed_dim,
+            num_heads=encoder_attention_heads,
+            dropout=config.attention_dropout,
+        )
+        self.attn_dropout = nn.Dropout(config.dropout)
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=encoder_ffn_dim)
+
+        self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: torch.Tensor,
+        output_attentions: bool = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.attn_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = self.ffn_dropout(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (attn_weights,)
+
+        return outputs
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoderLayer with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2DecoderLayer(GradientCheckpointingLayer):
+    def __init__(
+        self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None, layer_idx=None
+    ):
+        super().__init__()
+        decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
+        decoder_attention_heads = (
+            config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
+        )
+
+        self.embed_dim = config.hidden_size
+        self.self_attn = SeamlessM4Tv2Attention(
+            embed_dim=self.embed_dim,
+            num_heads=decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+            layer_idx=layer_idx,
+        )
+        self.dropout = config.dropout
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.attn_dropout = nn.Dropout(config.dropout)
+
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+        self.cross_attention = SeamlessM4Tv2Attention(
+            self.embed_dim,
+            decoder_attention_heads,
+            config.attention_dropout,
+            is_decoder=True,
+            layer_idx=layer_idx,
+        )
+        self.cross_attention_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.ffn = SeamlessM4Tv2FeedForwardNetwork(config, ffn_dim=decoder_ffn_dim)
+
+        self.ffn_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.ffn_dropout = nn.Dropout(config.activation_dropout)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.Tensor] = None,
+        encoder_attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[Cache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = True,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+            encoder_hidden_states (`torch.FloatTensor`):
+                cross attention input to the layer of shape `(batch, seq_len, embed_dim)`
+            encoder_attention_mask (`torch.FloatTensor`):
+                encoder attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by
+                very large negative values.
+            past_key_values (`Cache`):
+                cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            cache_position=cache_position,
+        )
+        hidden_states = self.attn_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # Cross-Attention Block
+        cross_attn_weights = None
+        if encoder_hidden_states is not None:
+            residual = hidden_states
+            hidden_states = self.cross_attention_layer_norm(hidden_states)
+
+            hidden_states, cross_attn_weights = self.cross_attention(
+                hidden_states=hidden_states,
+                encoder_hidden_states=encoder_hidden_states,
+                past_key_values=past_key_values,
+                attention_mask=encoder_attention_mask,
+                output_attentions=output_attentions,
+                cache_position=cache_position,
+            )
+            hidden_states = self.attn_dropout(hidden_states)
+            hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+
+        hidden_states = self.ffn_layer_norm(hidden_states)
+
+        hidden_states = self.ffn(hidden_states)
+        hidden_states = self.ffn_dropout(hidden_states)
+
+        hidden_states = residual + hidden_states
+
+        return hidden_states, self_attn_weights, cross_attn_weights
+
+
+class SeamlessM4Tv2TextToUnitDecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: SeamlessM4Tv2Config, decoder_ffn_dim=None, decoder_attention_heads=None):
+        super().__init__()
+        decoder_ffn_dim = config.decoder_ffn_dim if decoder_ffn_dim is None else decoder_ffn_dim
+        decoder_attention_heads = (
+            config.decoder_attention_heads if decoder_attention_heads is None else decoder_attention_heads
+        )
+        self.dropout = config.dropout
+        self.embed_dim = config.hidden_size
+
+        self.self_attn = SeamlessM4Tv2Attention(
+            embed_dim=self.embed_dim,
+            num_heads=decoder_attention_heads,
+            dropout=config.attention_dropout,
+            is_decoder=True,
+        )
+        self.self_attn_layer_norm = nn.LayerNorm(self.embed_dim)
+
+        self.conv1 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same")
+        self.activation_fn = ACT2FN[config.activation_function]
+        self.conv2 = nn.Conv1d(self.embed_dim, self.embed_dim, kernel_size=7, stride=1, padding="same")
+
+        self.conv_layer_norm = nn.LayerNorm(config.hidden_size)
+        self.conv_dropout = nn.Dropout(self.dropout)
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        padding_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = False,
+    ) -> torch.Tensor:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`):
+                input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`):
+                attention mask of size `(batch, 1, tgt_len, src_len)` where padding elements are indicated by very
+                large negative values.
+            padding_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indicates which inputs are to be ignored due to padding, where elements are either 1 for *not masked*
+                or 0 for *masked*
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+        """
+        residual = hidden_states
+
+        # Self Attention
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+        )
+        hidden_states = residual + hidden_states
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+
+        # Conv
+        residual = hidden_states
+
+        # Apply padding mask to avoid leaking padded positions in the convolution layer
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+        hidden_states = self.conv1(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+
+        hidden_states = self.activation_fn(hidden_states)
+        hidden_states = self.conv2(hidden_states.transpose(1, 2)).transpose(1, 2)
+
+        hidden_states = self.conv_dropout(hidden_states)
+        hidden_states = residual + hidden_states
+        hidden_states = self.conv_layer_norm(hidden_states)
+
+        return hidden_states, self_attn_weights
+
+
+############ SUB-MODELS related code ################
+
+
+@auto_docstring
+class SeamlessM4Tv2PreTrainedModel(PreTrainedModel):
+    config: SeamlessM4Tv2Config
+    base_model_prefix = "seamless_m4t_v2"
+    supports_gradient_checkpointing = True
+    _no_split_modules = [
+        "SeamlessM4Tv2EncoderLayer",
+        "SeamlessM4Tv2DecoderLayer",
+        "SeamlessM4Tv2ConformerEncoderLayer",
+        "SeamlessM4Tv2TextToUnitDecoderLayer",
+    ]
+
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights"""
+        std = self.config.initializer_range
+        if isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, SeamlessM4Tv2ConformerSelfAttention):
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, SeamlessM4Tv2ConformerFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, SeamlessM4Tv2TextToUnitDecoder):
+            module.pos_emb_alpha_char.data.fill_(1)
+            module.pos_emb_alpha.data.fill_(1)
+        elif isinstance(module, nn.LayerNorm):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, (nn.Conv1d, nn.ConvTranspose1d)):
+            nn.init.kaiming_normal_(module.weight)
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TPreTrainedModel._compute_sub_sample_lengths_from_attention_mask
+    def _compute_sub_sample_lengths_from_attention_mask(self, attention_mask):
+        kernel_size, stride = self.config.adaptor_kernel_size, self.config.adaptor_stride
+        pad = kernel_size // 2
+        seq_lens = attention_mask.size(1) - (1 - attention_mask.int()).sum(1)
+
+        seq_lens = ((seq_lens + 2 * pad - kernel_size) / stride) + 1
+
+        return seq_lens.floor()
+
+    def _indices_to_subwords(self, input_ids):
+        """
+        Returns the corresponding text string for each input id.
+        """
+        if not hasattr(self.generation_config, "id_to_text"):
+            raise ValueError(
+                """This model generation config doesn't have a `id_to_text` key which maps
+                token ids to subwords. Make sure to load the right generation config."""
+            )
+        batch_size, sequence_len = input_ids.shape
+
+        subwords_batch = []
+        for batch_id in range(batch_size):
+            subwords = []
+            for i in range(sequence_len):
+                subword = self.generation_config.id_to_text.get(str(input_ids[batch_id, i].item()))
+                subwords.append(str(subword))
+            subwords_batch.append(subwords)
+        return subwords_batch
+
+    def _count_character_length_in_subword(
+        self,
+        input_ids,
+        subwords_batch,
+        merge_space_with_prev_subword=False,
+        pad_token_id=0,
+        unk_token_id=1,
+        space="▁",
+    ):
+        """
+        Counts the number of characters per text string associated with the input token id.
+
+        Args:
+            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+            subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`):
+                Corresponding text string for each input id.
+            merge_space_with_prev_subword (`bool`, *optional*, defaults to `False`):
+                Indicates if the space character is merged with the previous subword. If `False`, it will be merged
+                with the next subword.
+            pad_token_id (`int`, *optional*, defaults to 0):
+                The id of the _padding_ text token. If it is encountered when calculating the length of a subword
+                sample, the lengths of subsequent subwords will be set to 0.
+            unk_token_id (`int`, *optional*, defaults to 1):
+                The id of the _unknown_ text token. Associated to a subword of length 1.
+            space (`str`, *optional*, defaults to `"▁"`):
+                The space character.
+        """
+        batch_size, _ = input_ids.shape
+
+        char_count_per_id = input_ids.new_zeros(input_ids.size())
+
+        subword_lens = input_ids.ne(pad_token_id).sum(1)
+
+        for batch_id in range(batch_size):
+            # We slice out the tensor till the padding index.
+            subword_indices = input_ids[batch_id, : subword_lens[batch_id]]
+            subwords = subwords_batch[batch_id][: subword_lens[batch_id]]
+
+            is_next_start_with_space = [
+                len(subwords[i + 1]) > 1 and subwords[i + 1][0] == space if i < len(subwords) - 1 else False
+                for i in range(len(subwords))
+            ]
+            is_punc = [
+                len(subwords[i]) == 1
+                and not subwords[i].isalpha()
+                and not subwords[i].isnumeric()
+                and subwords[i] != space
+                for i in range(len(subwords))
+            ]
+            for i, (subword_idx, subword) in enumerate(zip(subword_indices, subwords)):
+                if subword_idx == pad_token_id:
+                    break
+
+                if subword_idx == unk_token_id:
+                    # We set char_len to 1 for an unk token.
+                    char_len = 1
+
+                    if merge_space_with_prev_subword and is_next_start_with_space[i]:
+                        char_len += 1
+                else:
+                    # By default, spaces are merged with the next subword.
+                    # char_len includes the space.
+                    char_len = len(subword)
+
+                    if merge_space_with_prev_subword:
+                        # Add the space for the next subword.
+                        if is_next_start_with_space[i]:
+                            char_len += 1
+                        # Subtract the space for the current subword.
+                        if i > 0 and is_next_start_with_space[i - 1]:
+                            char_len -= 1
+                    else:
+                        # Merge space with punctuation mark by default.
+                        if is_punc[i] and is_next_start_with_space[i]:
+                            char_len += 1
+                        # Subtract the space for the subword succeeding the punctuation mark.
+                        elif i > 0 and is_punc[i - 1] and is_next_start_with_space[i - 1]:
+                            char_len -= 1
+
+                char_count_per_id[batch_id, i] = char_len
+
+        return char_count_per_id
+
+    def _get_char_input_ids(self, input_ids, subwords_batch, char_count_per_id, pad_token_id=0, unk_token_id=1):
+        """
+        Returns the corresponding character input id for each character of `subwords_batch`.
+
+        Args:
+            input_ids (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+            subwords_batch (`list[list[str]]` of shape `(batch_size, sequence_length)`):
+                Corresponding text string for each input id.
+            char_count_per_id (`torch.Tensor` of shape `(batch_size, sequence_length)`):
+                Number of characters per input id.
+            pad_token_id (`int`, *optional*, defaults to 0):
+                The id of the _padding_ text token. If it is encountered when calculating the length of a subword
+                sample, the lengths of subsequent subwords will be set to 0.
+            unk_token_id (`int`, *optional*, defaults to 1):
+                The id of the _unknown_ text token. Associated to a subword of length 1.
+        Returns:
+            `torch.Tensor`: Tensor of shape `(batch_size, char_sequence_length)` containing the id of each character.
+        """
+        if not hasattr(self.generation_config, "char_to_id"):
+            raise ValueError(
+                """This model generation config doesn't have a `char_to_id` key which maps
+                characters to character ids. Make sure to load the right generation config."""
+            )
+
+        batch_size = input_ids.shape[0]
+        max_len = int(char_count_per_id.sum(1).max().item())
+
+        char_seqs = input_ids.new_zeros((batch_size, max_len)).fill_(pad_token_id)
+
+        subword_lens = input_ids.ne(pad_token_id).sum(1)
+
+        for batch_id in range(batch_size):
+            total = 0
+            subword_indices = input_ids[batch_id, : subword_lens[batch_id]]
+            subwords = subwords_batch[batch_id][: subword_lens[batch_id]]
+            for subword_idx, subword in zip(subword_indices, subwords):
+                if subword_idx == unk_token_id:
+                    char_ids = [unk_token_id]
+                else:
+                    # Get char token indices corresponding to the subwords.
+                    char_ids = [self.generation_config.char_to_id.get(ch, unk_token_id) for ch in list(subword)]
+                char_seq_len = len(char_ids)
+                char_seqs[batch_id, total : total + char_seq_len] = torch.tensor(char_ids).to(char_seqs)
+                total += char_seq_len
+        return char_seqs
+
+    def _hard_upsample(self, hidden_states, durations):
+        """
+        Repeats the time dimension of each sample in the batch based on the corresponding duration.
+
+        Args:
+            hidden_states (`torch.Tensor` of shape `(batch_size, sequence_length, *)`, *optional*):
+                The sequence to repeat, where `*` is any number of sequence-specific dimensions including none.
+            durations (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indicates how many times to repeat time segments.
+        """
+        if hidden_states.size(0) == 1:
+            hidden_states = torch.repeat_interleave(hidden_states, durations.view(-1), dim=1)
+        else:
+            # if batched sample, need to interleave per sample, and pad -> loss of parallelism
+            if hidden_states.shape[0] > 1 and self.training:
+                logger.warning_once(
+                    """`self.training=True` and you use batching. You lose parallelism during the hifigan
+                               forward pass because the samples are interleaved."""
+                )
+            hidden_states = [
+                torch.repeat_interleave(hidden_state, duration, dim=0)
+                for (hidden_state, duration) in zip(hidden_states, durations)
+            ]
+
+            hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True)
+
+        return hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer speech encoder consisting of *config.speech_encoder_layers* conformer self attention layers.
+    Each layer is a [`SeamlessM4Tv2ConformerEncoderLayer`].
+    """
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TSpeechEncoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2SpeechEncoder(SeamlessM4Tv2PreTrainedModel):
+    main_input_name = "input_features"
+
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.feature_projection = SeamlessM4Tv2ConformerFeatureProjection(config)
+        self.encoder = SeamlessM4Tv2ConformerEncoder(config)
+        self.intermediate_ffn = SeamlessM4Tv2ConformerFeedForward(config, act_fn="relu", dropout=0.0)
+        self.adapter = SeamlessM4Tv2ConformerAdapter(config) if config.add_adapter else None
+        self.inner_layer_norm = nn.LayerNorm(config.hidden_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_features: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, Wav2Vec2BaseModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_features is None:
+            raise ValueError(
+                """Both `input_features` and `inputs_embeds` are `None` in `SeamlessM4Tv2SpeechEncoder.forward`.
+                Make sure one of them is not `None`."""
+            )
+
+        hidden_states = self.feature_projection(input_features)
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        expanded_hidden_states = self.intermediate_ffn(hidden_states)
+        hidden_states = hidden_states + 0.5 * expanded_hidden_states
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states, attention_mask=attention_mask)
+
+        hidden_states = self.inner_layer_norm(hidden_states)
+
+        if not return_dict:
+            return (hidden_states,) + encoder_outputs[1:]
+
+        return Wav2Vec2BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+# inspired from MBart and NllbMoe
+@auto_docstring(
+    custom_intro="""
+    Transformer encoder consisting of *config.encoder_layers* self attention layers. Each layer is a [`SeamlessM4Tv2EncoderLayer`].
+    """
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TEncoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2Encoder(SeamlessM4Tv2PreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens: Optional[nn.Embedding] = None,
+        is_t2u_encoder: bool = False,
+    ):
+        r"""
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+        is_t2u_encoder (`bool`, *optional*, defaults to `False`):
+            indicates if it belongs to the text-to-units model, in which case it won't have input embeddings
+        """
+        super().__init__(config)
+
+        self.dropout = config.dropout
+        self.layerdrop = config.encoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        embed_dim = config.hidden_size
+
+        self.is_t2u_encoder = is_t2u_encoder
+        self.max_source_positions = config.max_position_embeddings
+
+        if not self.is_t2u_encoder:
+            embed_scale = math.sqrt(embed_dim) if config.scale_embedding else 1.0
+
+            self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding(
+                config.vocab_size, embed_dim, self.padding_idx, embed_scale=embed_scale
+            )
+
+            if embed_tokens is not None:
+                self.embed_tokens.weight = embed_tokens.weight
+
+            self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+                self.max_source_positions,
+                embed_dim,
+                self.padding_idx,
+            )
+
+        layers = []
+        for _ in range(config.encoder_layers):
+            layers.append(
+                SeamlessM4Tv2EncoderLayer(
+                    config,
+                    encoder_attention_heads=config.encoder_attention_heads,
+                    encoder_ffn_dim=config.encoder_ffn_dim,
+                )
+            )
+
+        self.layers = nn.ModuleList(layers)
+
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[tuple, BaseModelOutput]:
+        r"""
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you
+                provide it.
+
+                Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
+                [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
+
+                - 1 for tokens that are **not masked**,
+                - 0 for tokens that are **masked**.
+
+                [What are attention masks?](../glossary#attention-mask)
+            inputs_embeds (`torch.FloatTensor` of shape `(batch_size, sequence_length, hidden_size)`, *optional*):
+                Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation.
+                This is useful if you want more control over how to convert `input_ids` indices into associated vectors
+                than the model's internal embedding lookup matrix.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if input_ids is not None and self.is_t2u_encoder:
+            raise ValueError(
+                "You cannot pass input_ids to the encoder of the text_to_units model. Pass inputs_embeds instead."
+            )
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif input_ids is not None:
+            input = input_ids
+            input_shape = input.shape
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if not self.is_t2u_encoder:
+            embed_pos = self.embed_positions(input)
+
+            hidden_states = inputs_embeds + embed_pos.to(inputs_embeds.device)
+        else:
+            hidden_states = inputs_embeds
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # expand attention_mask
+        if attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            attention_mask = _prepare_4d_attention_mask(attention_mask, inputs_embeds.dtype)
+
+        encoder_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+
+        for idx, encoder_layer in enumerate(self.layers):
+            if output_hidden_states:
+                encoder_states = encoder_states + (hidden_states,)
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            to_drop = False
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:  # skip the layer
+                    to_drop = True
+
+            if to_drop:
+                layer_outputs = (None, None)
+            else:
+                layer_outputs = encoder_layer(
+                    hidden_states,
+                    attention_mask,
+                    output_attentions=output_attentions,
+                )
+
+                hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        if output_hidden_states:
+            encoder_states = encoder_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`].
+    """
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TDecoder with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2Decoder(SeamlessM4Tv2PreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens: Optional[nn.Embedding] = None,
+    ):
+        r"""
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+        """
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.max_target_positions = config.max_position_embeddings
+        embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            # if embed_tokens defined, use its shape instead
+            self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding(
+                embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx, embed_scale=embed_scale
+            )
+            self.embed_tokens.weight = embed_tokens.weight
+        else:
+            self.embed_tokens = SeamlessM4Tv2ScaledWordEmbedding(
+                self.vocab_size, config.hidden_size, self.padding_idx, embed_scale=embed_scale
+            )
+
+        self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        layers = []
+        for i in range(config.decoder_layers):
+            layers.append(
+                SeamlessM4Tv2DecoderLayer(
+                    config,
+                    decoder_attention_heads=config.decoder_attention_heads,
+                    decoder_ffn_dim=config.decoder_ffn_dim,
+                    layer_idx=i,
+                )
+            )
+        self.layers = nn.ModuleList(layers)
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        encoder_attention_mask: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPastAndCrossAttentions]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # retrieve input_ids and inputs_embeds
+        if input_ids is not None and inputs_embeds is not None:
+            raise ValueError("You cannot specify both decoder_input_ids and decoder_inputs_embeds at the same time")
+        elif input_ids is not None:
+            input = input_ids
+            input_shape = input.size()
+            input_ids = input_ids.view(-1, input_shape[-1])
+        elif inputs_embeds is not None:
+            input_shape = inputs_embeds.size()[:-1]
+            input = inputs_embeds[:, :, -1]
+        else:
+            raise ValueError("You have to specify either decoder_input_ids or decoder_inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if self.gradient_checkpointing and self.training:
+            if use_cache:
+                logger.warning_once(
+                    "`use_cache=True` is incompatible with gradient checkpointing`. Setting `use_cache=False`..."
+                )
+                use_cache = False
+
+        # initialize `past_key_values`
+        if use_cache and past_key_values is None:
+            past_key_values = EncoderDecoderCache(DynamicCache(config=self.config), DynamicCache(config=self.config))
+        if use_cache and isinstance(past_key_values, tuple):
+            logger.warning_once(
+                "Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.58.0. "
+                "You should pass an instance of `EncoderDecoderCache` instead, e.g. "
+                "`past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`."
+            )
+            past_key_values = EncoderDecoderCache.from_legacy_cache(past_key_values)
+
+        past_key_values_length = past_key_values.get_seq_length() if past_key_values is not None else 0
+        attention_mask = _prepare_4d_causal_attention_mask(
+            attention_mask, input_shape, inputs_embeds, past_key_values_length
+        )
+
+        # expand encoder attention mask
+        if encoder_hidden_states is not None and encoder_attention_mask is not None:
+            # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len]
+            encoder_attention_mask = _prepare_4d_attention_mask(
+                encoder_attention_mask, inputs_embeds.dtype, tgt_len=input_shape[-1]
+            )
+
+        # embed positions
+        positions = self.embed_positions(input, past_key_values_length=past_key_values_length)
+
+        hidden_states = inputs_embeds + positions.to(inputs_embeds.device)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+        all_cross_attentions = () if (output_attentions and encoder_hidden_states is not None) else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask,
+                encoder_hidden_states,  # as a positional argument for gradient checkpointing
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_values=past_key_values,
+                output_attentions=output_attentions,
+                use_cache=use_cache,
+                cache_position=cache_position,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[1],)
+
+                if encoder_hidden_states is not None:
+                    all_cross_attentions += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(
+                v
+                for v in [hidden_states, past_key_values, all_hidden_states, all_self_attns, all_cross_attentions]
+                if v is not None
+            )
+        return BaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer decoder consisting of *config.decoder_layers* layers. Each layer is a [`SeamlessM4Tv2DecoderLayer`].
+    """
+)
+class SeamlessM4Tv2TextToUnitDecoder(SeamlessM4Tv2PreTrainedModel):
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens: Optional[nn.Embedding] = None,
+    ):
+        r"""
+        embed_tokens (`nn.Embedding`, *optional*):
+            Input embedding
+        """
+        super().__init__(config)
+        self.dropout = config.dropout
+        self.layerdrop = config.decoder_layerdrop
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+        self.max_target_positions = config.max_position_embeddings
+        self.embed_scale = math.sqrt(config.hidden_size) if config.scale_embedding else 1.0
+
+        if embed_tokens is not None:
+            # if embed_tokens defined, use its shape instead
+            self.embed_tokens = nn.Embedding(embed_tokens.num_embeddings, embed_tokens.embedding_dim, self.padding_idx)
+            self.embed_tokens.weight = embed_tokens.weight
+        else:
+            self.embed_tokens = nn.Embedding(self.vocab_size, config.hidden_size, self.padding_idx)
+
+        self.embed_char = nn.Embedding(config.char_vocab_size, config.hidden_size)
+        self.embed_char_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        self.pos_emb_alpha_char = nn.Parameter(torch.ones(1))
+        self.pos_emb_alpha = nn.Parameter(torch.ones(1))
+        self.duration_predictor = SeamlessM4Tv2VariancePredictor(
+            config.variance_predictor_embed_dim,
+            config.variance_predictor_hidden_dim,
+            config.variance_predictor_kernel_size,
+            config.variance_pred_dropout,
+        )
+
+        self.embed_positions = SeamlessM4Tv2SinusoidalPositionalEmbedding(
+            self.max_target_positions,
+            config.hidden_size,
+            padding_idx=self.padding_idx,
+        )
+
+        layers = []
+        for _ in range(config.decoder_layers):
+            layers.append(
+                SeamlessM4Tv2TextToUnitDecoderLayer(
+                    config,
+                    decoder_attention_heads=config.decoder_attention_heads,
+                    decoder_ffn_dim=config.decoder_ffn_dim,
+                )
+            )
+        self.layers = nn.ModuleList(layers)
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        char_input_ids: Optional[torch.LongTensor] = None,
+        char_count_per_id: Optional[torch.LongTensor] = None,
+        encoder_hidden_states: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SeamlessM4Tv2TextToUnitDecoderOutput]:
+        r"""
+        Args:
+            char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
+                Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
+                dictionary in the generation configuration.
+            char_count_per_id (`torch.Tensor` of shape `(batch_size, encoder_sequence_length)`):
+                Number of characters per text input id.
+            encoder_hidden_states (`torch.FloatTensor` of shape `(batch_size, encoder_sequence_length, hidden_size)`):
+                Sequence of hidden-states at the output of the last layer of the encoder. Used in the cross-attention
+                of the decoder.
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            output_hidden_states (`bool`, *optional*):
+                Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors
+                for more detail.
+            return_dict (`bool`, *optional*):
+                Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # create padding mask for character lengths
+        char_padding_mask = _compute_new_attention_mask(char_input_ids, char_count_per_id.sum(1))
+
+        # upsample hidden states according to characters sequence lengths
+        char_hidden_states = self._hard_upsample(encoder_hidden_states, char_count_per_id)
+        # embed char positions
+        char_positions = self.pos_emb_alpha_char * self.embed_char_positions(inputs_embeds=char_hidden_states)
+        # update char hidden states with positions and char embeddings
+        char_hidden_states = self.embed_char(char_input_ids) * self.embed_scale + char_positions + char_hidden_states
+
+        # predict duration
+        log_dur_pred = self.duration_predictor(char_hidden_states, padding_mask=char_padding_mask)
+        dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1)
+        dur_out = dur_out.masked_fill(~char_padding_mask.bool(), 0.0)
+
+        # upsample char hidden states according to predicted duration
+        char_hidden_states = self._hard_upsample(char_hidden_states, dur_out)
+
+        positions = self.pos_emb_alpha * self.embed_positions(inputs_embeds=char_hidden_states)
+        hidden_states = char_hidden_states + positions
+
+        padding_mask = _compute_new_attention_mask(hidden_states, dur_out.sum(1))
+        attention_mask = _prepare_4d_attention_mask(padding_mask, hidden_states.dtype)
+
+        hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
+
+        # decoder layers
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for idx, decoder_layer in enumerate(self.layers):
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+            if self.training:
+                dropout_probability = torch.rand([])
+                if dropout_probability < self.layerdrop:
+                    continue
+
+            layer_outputs = decoder_layer(
+                hidden_states,
+                attention_mask=attention_mask,
+                padding_mask=padding_mask,
+                output_attentions=output_attentions,
+            )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                all_self_attns += (layer_outputs[2],)
+
+        hidden_states = self.layer_norm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attns, padding_mask] if v is not None)
+        return SeamlessM4Tv2TextToUnitDecoderOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+            padding_mask=padding_mask,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer bare text-to-unit encoder-decoder. The encoder is a [`SeamlessM4Tv2Encoder`] without embeddings and the decoder is a [`SeamlessM4Tv2TextToUnitDecoder`].
+    """
+)
+class SeamlessM4Tv2TextToUnitModel(SeamlessM4Tv2PreTrainedModel):
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitModel.__init__ with SeamlessM4T->SeamlessM4Tv2, Decoder->TextToUnitDecoder
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens_decoder: Optional[nn.Embedding] = None,
+    ):
+        r"""
+        embed_tokens_decoder (`nn.Embedding`, *optional*):
+            input embedding of the decoder.
+        """
+        super().__init__(config)
+
+        self.encoder = SeamlessM4Tv2Encoder(config, is_t2u_encoder=True)
+        self.decoder = SeamlessM4Tv2TextToUnitDecoder(config, embed_tokens_decoder)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        char_input_ids: Optional[torch.LongTensor] = None,
+        char_count_per_id: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple[torch.Tensor], Seq2SeqModelOutput]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        # decoder outputs consists of (dec_features, dec_hidden, dec_attn, padding_mask)
+        decoder_outputs = self.decoder(
+            char_input_ids=char_input_ids,
+            char_count_per_id=char_count_per_id,
+            encoder_hidden_states=encoder_outputs[0],
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if not return_dict:
+            return decoder_outputs + encoder_outputs
+
+        return SeamlessM4Tv2TextToUnitOutput(
+            last_hidden_state=decoder_outputs.last_hidden_state,
+            padding_mask=decoder_outputs.padding_mask,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Transformer text-to-unit encoder-decoder with a language model head. The base encoder-decoder model is a [`SeamlessM4Tv2TextToUnitModel`].
+    """
+)
+class SeamlessM4Tv2TextToUnitForConditionalGeneration(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = [
+        "vocoder",
+        "speech_encoder",
+        "text_encoder",
+        "text_decoder",
+    ]
+    _tied_weights_keys = ["lm_head.weight"]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(
+        self,
+        config: SeamlessM4Tv2Config,
+        embed_tokens_decoder: Optional[nn.Embedding] = None,
+    ):
+        r"""
+        embed_tokens_decoder (`nn.Embedding`, *optional*):
+            input embedding of the decoder.
+        """
+        # update config - used principality for bos_token_id etc.
+        config = copy.deepcopy(config)
+        for param, val in config.to_dict().items():
+            if param.startswith("t2u_"):
+                config.__setattr__(param[4:], val)
+        super().__init__(config)
+
+        self.model = SeamlessM4Tv2TextToUnitModel(config, embed_tokens_decoder)
+
+        self.lm_head = nn.Linear(config.hidden_size, config.t2u_vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_encoder
+    def get_encoder(self):
+        return self.model.encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_decoder
+    def get_decoder(self):
+        return self.model.decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.model.decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.model.decoder.embed_tokens = value
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        char_input_ids: Optional[torch.LongTensor] = None,
+        char_count_per_id: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        char_input_ids (`torch.LongTensor` of shape `(batch_size, char_sequence_length)`):
+            Character indices. The correspondence between characters and indices can be found in `char_to_id`, a
+            dictionary in the generation configuration.
+        char_count_per_id (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+            Number of characters per input id.
+        inputs_embeds (`torch.FloatTensor` of shape`(batch_size, sequence_length, hidden_size)`, *optional*):
+            Optionally, instead of passing `input_ids` you can choose to directly pass an embedded representation. This
+            is useful if you want more control over how to convert `input_ids` indices into associated vectors than the
+            model's internal embedding lookup matrix.
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        outputs = self.model(
+            input_ids,
+            char_input_ids=char_input_ids,
+            char_count_per_id=char_count_per_id,
+            attention_mask=attention_mask,
+            encoder_outputs=encoder_outputs,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        lm_logits = self.lm_head(outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return SeamlessM4Tv2TextToUnitOutput(
+            last_hidden_state=lm_logits,
+            padding_mask=outputs.padding_mask,
+            decoder_hidden_states=outputs.decoder_hidden_states,
+            decoder_attentions=outputs.decoder_attentions,
+            encoder_last_hidden_state=outputs.encoder_last_hidden_state,
+            encoder_hidden_states=outputs.encoder_hidden_states,
+            encoder_attentions=outputs.encoder_attentions,
+            loss=masked_lm_loss,
+        )
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TTextToUnitForConditionalGeneration._tie_weights
+    def _tie_weights(self) -> None:
+        if getattr(self.config, "tie_word_embeddings", True):
+            output_embeddings = self.get_output_embeddings()
+            if output_embeddings is not None:
+                self._tie_or_clone_weights(output_embeddings, self.get_input_embeddings())
+
+
+############ VOCODER related code ################
+
+
+# Copied from transformers.models.speecht5.modeling_speecht5.HifiGanResidualBlock
+class HifiGanResidualBlock(nn.Module):
+    def __init__(self, channels, kernel_size=3, dilation=(1, 3, 5), leaky_relu_slope=0.1):
+        super().__init__()
+        self.leaky_relu_slope = leaky_relu_slope
+
+        self.convs1 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=dilation[i],
+                    padding=self.get_padding(kernel_size, dilation[i]),
+                )
+                for i in range(len(dilation))
+            ]
+        )
+        self.convs2 = nn.ModuleList(
+            [
+                nn.Conv1d(
+                    channels,
+                    channels,
+                    kernel_size,
+                    stride=1,
+                    dilation=1,
+                    padding=self.get_padding(kernel_size, 1),
+                )
+                for _ in range(len(dilation))
+            ]
+        )
+
+    def get_padding(self, kernel_size, dilation=1):
+        return (kernel_size * dilation - dilation) // 2
+
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        for layer in self.convs1:
+            weight_norm(layer)
+        for layer in self.convs2:
+            weight_norm(layer)
+
+    def remove_weight_norm(self):
+        for layer in self.convs1:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.convs2:
+            nn.utils.remove_weight_norm(layer)
+
+    def forward(self, hidden_states):
+        for conv1, conv2 in zip(self.convs1, self.convs2):
+            residual = hidden_states
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv1(hidden_states)
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = conv2(hidden_states)
+            hidden_states = hidden_states + residual
+        return hidden_states
+
+
+class SeamlessM4Tv2VariancePredictor(nn.Module):
+    def __init__(self, embed_dim, hidden_dim, kernel_size, var_pred_dropout):
+        super().__init__()
+
+        self.conv1 = nn.Conv1d(
+            embed_dim,
+            hidden_dim,
+            kernel_size=kernel_size,
+            padding="same",
+        )
+        self.activation_function = nn.ReLU()
+        self.ln1 = nn.LayerNorm(hidden_dim)
+        self.dropout_module = nn.Dropout(p=var_pred_dropout)
+        self.conv2 = nn.Conv1d(
+            hidden_dim,
+            hidden_dim,
+            kernel_size=kernel_size,
+            padding="same",
+        )
+        self.ln2 = nn.LayerNorm(hidden_dim)
+        self.proj = nn.Linear(hidden_dim, 1)
+
+    def forward(self, hidden_states: Tensor, padding_mask: Optional[Tensor] = None) -> Tensor:
+        # Input: B x T x C; Output: B x T
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+        hidden_states = self.conv1(hidden_states.transpose(1, 2))
+        hidden_states = self.activation_function(hidden_states).transpose(1, 2)
+        hidden_states = self.dropout_module(self.ln1(hidden_states))
+        if padding_mask is not None:
+            hidden_states = hidden_states.masked_fill(~padding_mask.bool().unsqueeze(-1), 0.0)
+        hidden_states = self.conv2(hidden_states.transpose(1, 2))
+        hidden_states = self.activation_function(hidden_states).transpose(1, 2)
+        hidden_states = self.dropout_module(self.ln2(hidden_states))
+        return self.proj(hidden_states).squeeze(dim=2)
+
+
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4THifiGan with SeamlessM4T->SeamlessM4Tv2
+class SeamlessM4Tv2HifiGan(nn.Module):
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__()
+        model_in_dim = config.unit_embed_dim + config.lang_embed_dim + config.spkr_embed_dim
+        self.leaky_relu_slope = config.leaky_relu_slope
+        self.num_kernels = len(config.resblock_kernel_sizes)
+        self.num_upsamples = len(config.upsample_rates)
+        self.conv_pre = nn.Conv1d(
+            model_in_dim,
+            config.upsample_initial_channel,
+            kernel_size=7,
+            stride=1,
+            padding=3,
+        )
+
+        self.upsampler = nn.ModuleList()
+        for i, (upsample_rate, kernel_size) in enumerate(zip(config.upsample_rates, config.upsample_kernel_sizes)):
+            self.upsampler.append(
+                nn.ConvTranspose1d(
+                    config.upsample_initial_channel // (2**i),
+                    config.upsample_initial_channel // (2 ** (i + 1)),
+                    kernel_size=kernel_size,
+                    stride=upsample_rate,
+                    padding=(kernel_size - upsample_rate) // 2,
+                )
+            )
+
+        self.resblocks = nn.ModuleList()
+        for i in range(len(self.upsampler)):
+            channels = config.upsample_initial_channel // (2 ** (i + 1))
+            for kernel_size, dilation in zip(config.resblock_kernel_sizes, config.resblock_dilation_sizes):
+                self.resblocks.append(HifiGanResidualBlock(channels, kernel_size, dilation, config.leaky_relu_slope))
+
+        self.conv_post = nn.Conv1d(channels, 1, kernel_size=7, stride=1, padding=3)
+
+    def forward(self, input_embeds: torch.FloatTensor) -> torch.FloatTensor:
+        r"""
+        Converts a log-mel spectrogram into a speech waveform. Passing a batch of log-mel spectrograms returns a batch
+        of speech waveforms. Passing a single, un-batched log-mel spectrogram returns a single, un-batched speech
+        waveform.
+
+        Args:
+            spectrogram (`torch.FloatTensor`):
+                Tensor containing the log-mel spectrograms. Can be batched and of shape `(batch_size, sequence_length,
+                model_in_dim)`, or un-batched and of shape `(sequence_length, model_in_dim)`. Note that `model_in_dim`
+                is the sum of `config.unit_embed_dim`, `config.lang_embed_dim` and `config.spkr_embed_dim`.
+
+        Returns:
+            `torch.FloatTensor`: Tensor containing the speech waveform. If the input spectrogram is batched, will be of
+            shape `(batch_size, num_frames,)`. If un-batched, will be of shape `(num_frames,)`.
+        """
+
+        hidden_states = self.conv_pre(input_embeds)
+        for i in range(self.num_upsamples):
+            hidden_states = nn.functional.leaky_relu(hidden_states, self.leaky_relu_slope)
+            hidden_states = self.upsampler[i](hidden_states)
+
+            res_state = self.resblocks[i * self.num_kernels](hidden_states)
+            for j in range(1, self.num_kernels):
+                res_state += self.resblocks[i * self.num_kernels + j](hidden_states)
+            hidden_states = res_state / self.num_kernels
+
+        hidden_states = nn.functional.leaky_relu(hidden_states)
+        hidden_states = self.conv_post(hidden_states)
+        hidden_states = torch.tanh(hidden_states)
+
+        # remove seq-len dim since this collapses to 1
+        waveform = hidden_states.squeeze(1)
+
+        return waveform
+
+
+@auto_docstring(
+    custom_intro="""
+    Code HiFi-GAN vocoder as described in this [repository](https://github.com/facebookresearch/speech-resynthesis).
+    """
+)
+class SeamlessM4Tv2CodeHifiGan(PreTrainedModel):
+    config: SeamlessM4Tv2Config
+    main_input_name = "input_embeds"
+    _no_split_modules = []
+
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.pad_token_id = config.t2u_pad_token_id
+        embed_dim = config.unit_embed_dim
+        kernel_size = config.variance_predictor_kernel_size
+        var_pred_dropout = config.var_pred_dropout
+        self.dur_predictor = SeamlessM4Tv2VariancePredictor(embed_dim, embed_dim, kernel_size, var_pred_dropout)
+
+        self.unit_embedding = nn.Embedding(config.unit_hifi_gan_vocab_size, config.unit_embed_dim)
+        self.speaker_embedding = nn.Embedding(config.vocoder_num_spkrs, config.spkr_embed_dim)
+        self.language_embedding = nn.Embedding(config.vocoder_num_langs, config.lang_embed_dim)
+
+        self.hifi_gan = SeamlessM4Tv2HifiGan(config)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_dur_output_lengths
+    def _get_dur_output_lengths(self, input_ids, dur_out):
+        """
+        Computes the output length after the duration layer.
+        """
+        unit_lengths = (input_ids != self.pad_token_id).sum(1)
+
+        # take care of edge cases where no padding or too many padding
+        unit_lengths = torch.clamp(unit_lengths, 0, dur_out.shape[1] - 1)
+
+        cumulative_dur_out = torch.cumsum(dur_out, dim=1)
+        unit_lengths = cumulative_dur_out.gather(dim=1, index=unit_lengths.unsqueeze(1)).squeeze()
+
+        return unit_lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._get_output_hifigan_lengths
+    def _get_output_hifigan_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the hifigan convolutional layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (
+                torch.div(input_length + 2 * pad - dilation * (kernel_size - 1) - 1, stride, rounding_mode="floor") + 1
+            )
+
+        def _transpose_conv_out_length(input_length, kernel_size, stride, pad, dilation=1):
+            return (input_length - 1) * stride - 2 * pad + dilation * (kernel_size - 1) + 1
+
+        # conv_pre
+        input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+        # upsampler
+        for i, (upsample_rate, kernel_size) in enumerate(
+            zip(self.config.upsample_rates, self.config.upsample_kernel_sizes)
+        ):
+            input_lengths = _transpose_conv_out_length(
+                input_lengths, kernel_size, upsample_rate, (kernel_size - upsample_rate) // 2
+            )
+
+        # resblock
+        for i in range(len(self.config.upsample_rates)):
+            for kernel_size, dilation in zip(self.config.resblock_kernel_sizes, self.config.resblock_dilation_sizes):
+                for dil in dilation:
+                    input_lengths = _conv_out_length(
+                        input_lengths, kernel_size, 1, (kernel_size - 1) * dil // 2, dilation=dil
+                    )
+
+                for dil in dilation:
+                    input_lengths = _conv_out_length(input_lengths, kernel_size, 1, (kernel_size - 1) // 2, dilation=1)
+
+        # conv_post
+        input_lengths = _conv_out_length(input_lengths, 7, 1, 3)
+
+        return input_lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.forward with SeamlessM4T->SeamlessM4Tv2, spkr_id->speaker_id
+    def forward(
+        self, input_ids: torch.LongTensor, speaker_id: torch.Tensor, lang_id: torch.Tensor
+    ) -> tuple[torch.Tensor]:
+        """
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4Tv2TextToUnitForConditionalGeneration`]. [What are input
+                IDs?](../glossary#input-ids)
+            speaker_id (`int`, *optional*):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            tgt_lang (`str`, *optional*):
+                The language id to use as target language for translation.
+        """
+        hidden_states = self.unit_embedding(input_ids).transpose(1, 2)
+        spkr = self.speaker_embedding(speaker_id).transpose(1, 2)
+        lang = self.language_embedding(lang_id).transpose(1, 2)
+
+        log_dur_pred = self.dur_predictor(hidden_states.transpose(1, 2))
+        dur_out = torch.clamp(torch.round(torch.expm1(log_dur_pred)).long(), min=1)
+        # B x C x T
+        if hidden_states.size(0) == 1:
+            hidden_states = torch.repeat_interleave(hidden_states, dur_out.view(-1), dim=2)
+        else:
+            # if batched sample, need to interleave per sample, and pad -> loss of parallelism
+            if hidden_states.shape[0] > 1 and self.training:
+                logger.warning(
+                    """`self.training=True` and you use batching. You lose parallelism during the hifigan
+                               forward pass because the samples are interleaved."""
+                )
+            hidden_states = [
+                torch.repeat_interleave(hidden_state, duration, dim=-1).transpose(0, 1)
+                for (hidden_state, duration) in zip(hidden_states, dur_out)
+            ]
+
+            hidden_states = nn.utils.rnn.pad_sequence(hidden_states, batch_first=True).transpose(1, 2)
+
+        spkr = spkr.repeat(1, 1, hidden_states.shape[-1])
+        lang = lang.repeat(1, 1, hidden_states.shape[-1])
+        hidden_states = torch.cat([lang, hidden_states, spkr], dim=1)
+
+        hidden_states = self.hifi_gan(hidden_states)
+
+        unit_lengths = self._get_dur_output_lengths(input_ids, dur_out)
+        lengths = self._get_output_hifigan_lengths(unit_lengths)
+
+        return hidden_states, lengths
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan._init_weights
+    def _init_weights(self, module: nn.Module):
+        """Initialize the weights."""
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv1d, nn.ConvTranspose1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, nn.LayerNorm):
+            module.weight.data.fill_(1.0)
+            module.bias.data.zero_()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.apply_weight_norm
+    def apply_weight_norm(self):
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        weight_norm(self.hifi_gan.conv_pre)
+        for layer in self.hifi_gan.upsampler:
+            weight_norm(layer)
+        for layer in self.hifi_gan.resblocks:
+            layer.apply_weight_norm()
+        weight_norm(self.hifi_gan.conv_post)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TCodeHifiGan.remove_weight_norm
+    def remove_weight_norm(self):
+        nn.utils.remove_weight_norm(self.hifi_gan.conv_pre)
+        for layer in self.hifi_gan.upsampler:
+            nn.utils.remove_weight_norm(layer)
+        for layer in self.hifi_gan.resblocks:
+            layer.remove_weight_norm()
+        nn.utils.remove_weight_norm(self.hifi_gan.conv_post)
+
+
+############ WHOLE MODEL related code ################
+
+
+@auto_docstring(
+    custom_intro="""
+    The text-to-text SeamlessM4Tv2 Model transformer which can be used for T2TT.
+    """
+)
+# Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToText with SeamlessM4T->SeamlessM4Tv2,SeamlessM4Tv2Tokenizer->SeamlessM4TTokenizer, SeamlessM4Tv2Processor->SeamlessM4TProcessor, SEAMLESS_M4T->SEAMLESS_M4T_V2
+class SeamlessM4Tv2ForTextToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["speech_encoder", "t2u_model", "vocoder"]
+    main_input_name = "input_ids"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def get_encoder(self):
+        return self.text_encoder
+
+    def get_decoder(self):
+        return self.text_decoder
+
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    def generate(
+        self,
+        input_ids=None,
+        tgt_lang=None,
+        generation_config=None,
+        logits_processor=None,
+        stopping_criteria=None,
+        prefix_allowed_tokens_fn=None,
+        synced_gpus=False,
+        **kwargs,
+    ):
+        """
+        Generates sequences of token ids.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            input_ids (`torch.Tensor` of varying shape depending on the modality, *optional*):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
+                If provided, this function constraints the beam search to allowed tokens only at each step. If not
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+                Retrieval](https://huggingface.co/papers/2010.00904).
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            kwargs (`dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
+            [`~utils.ModelOutput`] types are:
+                - [`~generation.GenerateEncoderDecoderOutput`],
+                - [`~generation.GenerateBeamEncoderDecoderOutput`]
+        """
+        # prepare text_decoder_input_ids
+        text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        if tgt_lang is not None:
+            batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+            if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+                # also accept __xxx__
+                tgt_lang = tgt_lang.replace("__", "")
+                if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+                        {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+                    )
+                # tgt_lang gets priority over decoder input ids
+                text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+                text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+            else:
+                raise ValueError(
+                    """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+                    the target language to the right token id. Make sure to load the right generation config."""
+                )
+        else:
+            # only a warning, otherwise errors appear in the tests
+            logger.warning(
+                """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+                a correct generation, otherwise the generation will probably make no sense."""
+            )
+
+        return super().generate(
+            input_ids,
+            generation_config,
+            logits_processor,
+            stopping_criteria,
+            prefix_allowed_tokens_fn,
+            synced_gpus,
+            decoder_input_ids=text_decoder_input_ids,
+            **kwargs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The speech-to-text SeamlessM4Tv2 Model transformer which can be used for S2TT.
+    """
+)
+class SeamlessM4Tv2ForSpeechToText(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["text_encoder", "t2u_model", "vocoder"]
+    main_input_name = "input_features"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_encoder
+    def get_encoder(self):
+        return self.speech_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_decoder
+    def get_decoder(self):
+        return self.text_decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_decoder.embed_tokens = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.forward
+    def forward(
+        self,
+        input_features: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToText.generate
+    def generate(
+        self,
+        input_features=None,
+        tgt_lang=None,
+        generation_config=None,
+        logits_processor=None,
+        stopping_criteria=None,
+        prefix_allowed_tokens_fn=None,
+        synced_gpus=False,
+        **kwargs,
+    ):
+        """
+        Generates sequences of token ids.
+
+        <Tip warning={true}>
+
+        Most generation-controlling parameters are set in `generation_config` which, if not passed, will be set to the
+        model's default generation configuration. You can override any `generation_config` by passing the corresponding
+        parameters to generate(), e.g. `.generate(inputs, num_beams=4, do_sample=True)`.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Parameters:
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            generation_config (`~generation.GenerationConfig`, *optional*):
+                The generation configuration to be used as base parametrization for the generation call. `**kwargs`
+                passed to generate matching the attributes of `generation_config` will override them. If
+                `generation_config` is not provided, the default will be used, which had the following loading
+                priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
+                configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
+                default values, whose documentation should be checked to parameterize generation.
+            logits_processor (`LogitsProcessorList`, *optional*):
+                Custom logits processors that complement the default logits processors built from arguments and
+                generation config. If a logit processor is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            stopping_criteria (`StoppingCriteriaList`, *optional*):
+                Custom stopping criteria that complement the default stopping criteria built from arguments and a
+                generation config. If a stopping criteria is passed that is already created with the arguments or a
+                generation config an error is thrown. This feature is intended for advanced users.
+            prefix_allowed_tokens_fn (`Callable[[int, torch.Tensor], list[int]]`, *optional*):
+                If provided, this function constraints the beam search to allowed tokens only at each step. If not
+                provided no constraint is applied. This function takes 2 arguments: the batch ID `batch_id` and
+                `input_ids`. It has to return a list with the allowed tokens for the next generation step conditioned
+                on the batch ID `batch_id` and the previously generated tokens `inputs_ids`. This argument is useful
+                for constrained generation conditioned on the prefix, as described in [Autoregressive Entity
+                Retrieval](https://huggingface.co/papers/2010.00904).
+            synced_gpus (`bool`, *optional*, defaults to `False`):
+                Whether to continue running the while loop until max_length (needed to avoid deadlocking with
+                `FullyShardedDataParallel` and DeepSpeed ZeRO Stage 3).
+            kwargs (`dict[str, Any]`, *optional*):
+                Ad hoc parametrization of `generate_config` and/or additional model-specific kwargs that will be
+                forwarded to the `forward` function of the model.
+
+        Return:
+            [`~utils.ModelOutput`] or `torch.LongTensor`: A [`~utils.ModelOutput`] (if `return_dict_in_generate=True`
+            or when `config.return_dict_in_generate=True`) or a `torch.FloatTensor`. The possible
+            [`~utils.ModelOutput`] types are:
+                - [`~generation.GenerateEncoderDecoderOutput`],
+                - [`~generation.GenerateBeamEncoderDecoderOutput`]
+        """
+        text_decoder_input_ids = kwargs.pop("decoder_input_ids", None)
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        input_features = input_features if input_features is not None else kwargs.pop("inputs")
+        if tgt_lang is not None:
+            inputs = kwargs.get("input_embeds") if input_features is None else input_features
+            inputs = (
+                inputs
+                if inputs is not None
+                else kwargs.get("encoder_outputs", {"last_hidden_state": None})["last_hidden_state"]
+            )
+            batch_size = len(inputs)
+
+            if hasattr(self.generation_config, "text_decoder_lang_to_code_id"):
+                # also accept __xxx__
+                tgt_lang = tgt_lang.replace("__", "")
+                if tgt_lang not in self.generation_config.text_decoder_lang_to_code_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model. Please specify a `tgt_lang` in
+                        {", ".join(self.generation_config.text_decoder_lang_to_code_id.keys())}"""
+                    )
+                # tgt_lang gets priority over decoder input ids
+                text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+                text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+            else:
+                raise ValueError(
+                    """This model generation config doesn't have a `text_decoder_lang_to_code_id` key which maps
+                    the target language to the right token id. Make sure to load the right generation config."""
+                )
+        else:
+            # only a warning, otherwise errors appear in the tests
+            logger.warning(
+                """You must either specify a `tgt_lang` or pass a correct `text_decoder_input_ids` to get
+                a correct generation, otherwise the generation will probably make no sense."""
+            )
+        return super().generate(
+            input_features,
+            generation_config,
+            logits_processor,
+            stopping_criteria,
+            prefix_allowed_tokens_fn,
+            synced_gpus,
+            decoder_input_ids=text_decoder_input_ids,
+            **kwargs,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    The text-to-speech SeamlessM4Tv2 Model transformer which can be used for T2ST.
+    """
+)
+class SeamlessM4Tv2ForTextToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["speech_encoder"]
+    main_input_name = "input_ids"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config: SeamlessM4Tv2Config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_encoder
+    def get_encoder(self):
+        return self.text_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_decoder
+    def get_decoder(self):
+        return self.text_decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForTextToSpeech.forward with SeamlessM4T->SeamlessM4Tv2
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.Tensor] = None,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This is the same forward method as `SeamlessM4Tv2ForTextToText`."
+                "It doesn't use the text-to-unit model `SeamlessM4Tv2TextToUnitForConditionalGeneration`."
+                "If you want to generate speech, use the `.generate` method."
+            )
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            cache_position=cache_position,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        speaker_id: Optional[int] = 0,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+        """
+        Generates translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_ids, num_beams=4, speech_do_sample=True)` will successively perform
+        beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            speaker_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+
+        Returns:
+            `Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`:
+            - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+            - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+              sequence_length)` and `waveform_lengths` which gives the length of each sample.
+        """
+        batch_size = len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds"))
+
+        if tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+        else:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+        text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        text_generation_output = super().generate(input_ids, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        if attention_mask is not None:
+            # repeat attention mask alongside batch dimension
+            attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+        encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+        # repeat attention mask alongside batch dimension
+        encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences[:, :-1],  # Manually trim the final EOS token
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # REMOVE EOS and lang_id
+        t2u_input_ids = sequences[:, 2:-1]
+        # replace every other EOS
+        t2u_input_ids = torch.masked_fill(
+            t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+        )
+
+        # compute t2u_char_input_ids
+        t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+        t2u_char_count_per_id = self._count_character_length_in_subword(
+            t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+        )
+
+        # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+        pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+        t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+        t2u_char_input_ids = self._get_char_input_ids(
+            t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+        )
+
+        # second pass
+        t2u_output = self.t2u_model(
+            inputs_embeds=t2u_input_embeds,
+            char_input_ids=t2u_char_input_ids,
+            char_count_per_id=t2u_char_count_per_id,
+            **kwargs_speech,
+        )
+
+        t2u_logits = t2u_output[0]
+        padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature
+        temperature = kwargs_speech.get("temperature", None)
+        if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+            unit_ids = t2u_logits.argmax(dim=-1)
+        else:
+            t2u_logits = t2u_logits / temperature
+            # apply softmax
+            probs = nn.functional.softmax(t2u_logits, dim=-1)
+            # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+            probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
+            unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+        output_unit_ids = unit_ids.detach().clone()
+
+        replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace eos per pad
+        unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+        speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device)
+
+        waveform, waveform_lengths = self.vocoder(
+            input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+        )
+
+        if return_intermediate_token_ids:
+            return SeamlessM4Tv2GenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+
+@auto_docstring(
+    custom_intro="""
+    The speech-to-speech SeamlessM4Tv2 Model transformer which can be used for S2ST.
+    """
+)
+class SeamlessM4Tv2ForSpeechToSpeech(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+    _keys_to_ignore_on_load_missing = ["text_encoder"]
+    main_input_name = "input_features"
+
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+        self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_encoder
+    def get_encoder(self):
+        return self.speech_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_decoder
+    def get_decoder(self):
+        return self.text_decoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_decoder.embed_tokens = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TForSpeechToSpeech.forward with SeamlessM4T->SeamlessM4Tv2
+    def forward(
+        self,
+        input_features: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if encoder_outputs is None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This is the same forward method as `SeamlessM4Tv2ForSpeechToText`. It doesn't use `self.t2u_model`."
+                "If you want to generate speech, use the `generate` method."
+            )
+
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_features: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        speaker_id: Optional[int] = 0,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+        """
+        Generates translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_features, num_beams=4, speech_do_sample=True)` will successively perform
+        beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+        Args:
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`):
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            speaker_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+
+            kwargs (*optional*):
+                Remaining dictionary of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+
+        Returns:
+            `Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor]]`:
+            - If `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+            - If not `return_intermediate_token_ids`, returns a tuple composed of waveforms of shape `(batch_size,
+              sequence_length)` and `waveform_lengths` which gives the length of each sample.
+        """
+        batch_size = len(input_features) if input_features is not None else len(kwargs.get("inputs_embeds"))
+
+        if tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+        else:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            for key in ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+        text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        text_generation_output = super().generate(input_features, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        # get last_hidden_state from encoder
+        encoder_hidden_states = self.speech_encoder(input_features=input_features, attention_mask=attention_mask)[0]
+
+        # input modality = speech so new attention mask for the decoder
+        if attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_hidden_states.device
+            )
+            attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+            )
+
+            # repeat attention mask alongside batch dimension
+            attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+
+        # repeat attention mask alongside batch dimension
+        encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences[:, :-1],  # Manually trim the final EOS token
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # REMOVE EOS and lang_id
+        t2u_input_ids = sequences[:, 2:-1]
+        # replace every other EOS
+        t2u_input_ids = torch.masked_fill(
+            t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+        )
+
+        # compute t2u_char_input_ids
+        t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+        t2u_char_count_per_id = self._count_character_length_in_subword(
+            t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+        )
+
+        # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+        pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+        t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+        t2u_char_input_ids = self._get_char_input_ids(
+            t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+        )
+
+        # second pass
+        t2u_output = self.t2u_model(
+            inputs_embeds=t2u_input_embeds,
+            char_input_ids=t2u_char_input_ids,
+            char_count_per_id=t2u_char_count_per_id,
+            **kwargs_speech,
+        )
+
+        t2u_logits = t2u_output[0]
+        padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature
+        temperature = kwargs_speech.get("temperature", None)
+        if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+            unit_ids = t2u_logits.argmax(dim=-1)
+        else:
+            t2u_logits = t2u_logits / temperature
+            # apply softmax
+            probs = nn.functional.softmax(t2u_logits, dim=-1)
+            # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+            probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
+            unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+        output_unit_ids = unit_ids.detach().clone()
+
+        replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace eos per pad
+        unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+        speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device)
+
+        waveform, waveform_lengths = self.vocoder(
+            input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+        )
+
+        if return_intermediate_token_ids:
+            return SeamlessM4Tv2GenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+
+@auto_docstring(
+    custom_intro="""
+    The original SeamlessM4Tv2 Model transformer which can be used for every tasks available (S2ST, S2TT, T2TT, T2ST).
+    """
+)
+class SeamlessM4Tv2Model(SeamlessM4Tv2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = [
+        "lm_head.weight",
+        "text_encoder.embed_tokens.weight",
+        "text_decoder.embed_tokens.weight",
+    ]
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.__init__ with SeamlessM4T->SeamlessM4Tv2
+    def __init__(self, config, current_modality="text"):
+        r"""
+        current_modality (`str`, *optional*, defaults to `"text"`):
+            Default modality. Used to initialize the model.
+        """
+        super().__init__(config)
+
+        self.shared = nn.Embedding(config.vocab_size, config.hidden_size, config.pad_token_id)
+
+        self.text_encoder = SeamlessM4Tv2Encoder(config, self.shared)
+        self.speech_encoder = SeamlessM4Tv2SpeechEncoder(config)
+        self.text_decoder = SeamlessM4Tv2Decoder(config, self.shared)
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+        self.current_modality = current_modality
+        if current_modality == "speech":
+            self.main_input_name = "input_features"
+
+        # these models already call post_init in their initialization
+        self.t2u_model = SeamlessM4Tv2TextToUnitForConditionalGeneration(config)
+        self.vocoder = SeamlessM4Tv2CodeHifiGan(config)
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_modality
+    def set_modality(self, modality="text"):
+        if modality == "text":
+            self.main_input_name = "input_ids"
+            self.current_modality = "text"
+        elif modality == "speech":
+            self.main_input_name = "input_features"
+            self.current_modality = "speech"
+        else:
+            raise ValueError(f"`modality={modality}` is not a valid modality. It must be `text` or `speech`.")
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_encoder
+    def get_encoder(self):
+        if self.current_modality == "text":
+            return self.text_encoder
+        else:
+            return self.speech_encoder
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.get_input_embeddings
+    def get_input_embeddings(self):
+        return self.text_decoder.embed_tokens
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.set_input_embeddings
+    def set_input_embeddings(self, value):
+        self.text_encoder.embed_tokens = value
+        self.text_decoder.embed_tokens = value
+        self.shared = value
+
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel._tie_weights
+    def _tie_weights(self):
+        if self.config.tie_word_embeddings:
+            self._tie_or_clone_weights(self.text_encoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.text_decoder.embed_tokens, self.shared)
+            self._tie_or_clone_weights(self.lm_head, self.shared)
+
+    @auto_docstring(custom_args=SEAMLESS_M4T_V2_COMMON_CUSTOM_ARGS)
+    # Copied from transformers.models.seamless_m4t.modeling_seamless_m4t.SeamlessM4TModel.forward with SeamlessM4T->SeamlessM4Tv2
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        input_features: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        decoder_input_ids: Optional[torch.LongTensor] = None,
+        decoder_attention_mask: Optional[torch.LongTensor] = None,
+        encoder_outputs: Optional[tuple[tuple[torch.FloatTensor]]] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        decoder_inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        **kwargs,
+    ) -> Union[Seq2SeqLMOutput, tuple[torch.FloatTensor]]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should be in `[-100, 0, ...,
+            config.vocab_size]` (see `input_ids` docstring) Tokens with indices set to `-100` are ignored (masked), the
+            loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None:
+            if use_cache:
+                logger.warning("The `use_cache` argument is changed to `False` since `labels` is provided.")
+            use_cache = False
+            if decoder_input_ids is None and decoder_inputs_embeds is None:
+                decoder_input_ids = shift_tokens_right(
+                    labels, self.config.pad_token_id, self.config.decoder_start_token_id
+                )
+
+        if input_ids is None and input_features is None and inputs_embeds is None and encoder_outputs is None:
+            raise ValueError(
+                "`input_ids`,`input_features`, `inputs_embeds` and `encoder_outputs` are all empty. Make sure at least one of them is not."
+            )
+        elif input_features is not None:
+            if input_ids is not None:
+                logger.warning(
+                    "`input_ids` is not `None` but `input_features` has been given."
+                    "`input_features` will be used in priority through the `speech_encoder`. "
+                    "Make sure that `input_features` and `input_ids` are mutually exclusive."
+                )
+
+            if inputs_embeds is not None:
+                logger.warning(
+                    "`inputs_embeds` is not `None` but `input_features` has been given."
+                    "`input_features` will be used in priority through `speech_encoder`. "
+                    "`inputs_embeds` will be ignored."
+                )
+
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText`"
+                "depending on the input modality. If you want to generate speech, use the `generate` method."
+            )
+
+            self.set_modality("speech")
+
+            encoder_outputs = self.speech_encoder(
+                input_features=input_features,
+                attention_mask=attention_mask,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+
+        elif input_ids is not None or inputs_embeds is not None:
+            # if encoder_outputs is not None, it's probably used within a .generate method so no need to warn
+            logger.warning(
+                "This calls the same method `forward` as `SeamlessM4Tv2ForTextToText` and `SeamlessM4Tv2ForSpeechToText`"
+                "depending on the input modality. If you want to generate speech, use the `generate` method."
+            )
+            self.set_modality("text")
+            encoder_outputs = self.text_encoder(
+                input_ids=input_ids,
+                attention_mask=attention_mask,
+                inputs_embeds=inputs_embeds,
+                output_attentions=output_attentions,
+                output_hidden_states=output_hidden_states,
+                return_dict=return_dict,
+            )
+        # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
+        elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
+            encoder_outputs = BaseModelOutput(
+                last_hidden_state=encoder_outputs[0],
+                hidden_states=encoder_outputs[1] if len(encoder_outputs) > 1 else None,
+                attentions=encoder_outputs[2] if len(encoder_outputs) > 2 else None,
+            )
+
+        encoder_attention_mask = attention_mask
+        # input modality = speech so new attention mask
+        if self.current_modality == "speech" and attention_mask is not None:
+            sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                encoder_outputs[0].device
+            )
+            encoder_attention_mask = _compute_new_attention_mask(
+                hidden_states=encoder_outputs[0], seq_lens=sub_sampled_lengths
+            )
+
+        # decoder outputs consists of (dec_features, past_key_values, dec_hidden, dec_attn)
+        decoder_outputs = self.text_decoder(
+            input_ids=decoder_input_ids,
+            attention_mask=decoder_attention_mask,
+            encoder_hidden_states=encoder_outputs[0],
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_values=past_key_values,
+            inputs_embeds=decoder_inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        lm_logits = self.lm_head(decoder_outputs[0])
+
+        masked_lm_loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            labels = labels.to(lm_logits.device)
+            masked_lm_loss = loss_fct(lm_logits.view(-1, self.config.vocab_size), labels.view(-1))
+
+        if not return_dict:
+            outputs = decoder_outputs + encoder_outputs
+            output = (lm_logits,) + outputs[1:]
+            return ((masked_lm_loss,) + output) if masked_lm_loss is not None else output
+
+        return Seq2SeqLMOutput(
+            loss=masked_lm_loss,
+            logits=lm_logits,
+            past_key_values=decoder_outputs.past_key_values,
+            decoder_hidden_states=decoder_outputs.hidden_states,
+            decoder_attentions=decoder_outputs.attentions,
+            cross_attentions=decoder_outputs.cross_attentions,
+            encoder_last_hidden_state=encoder_outputs.last_hidden_state,
+            encoder_hidden_states=encoder_outputs.hidden_states,
+            encoder_attentions=encoder_outputs.attentions,
+        )
+
+    @torch.no_grad()
+    def generate(
+        self,
+        input_ids: Optional[torch.Tensor] = None,
+        input_features: Optional[torch.Tensor] = None,
+        return_intermediate_token_ids: Optional[bool] = None,
+        tgt_lang: Optional[str] = None,
+        speaker_id: Optional[int] = 0,
+        generate_speech: Optional[bool] = True,
+        **kwargs,
+    ) -> Union[torch.Tensor, SeamlessM4Tv2GenerationOutput]:
+        """
+        Generates translated token ids and/or translated audio waveforms.
+
+        <Tip>
+
+        This method successively calls the `.generate` function of two different sub-models. You can specify keyword
+        arguments at two different levels: general arguments that will be passed to both models, or prefixed arguments
+        that will be passed to one of them.
+
+        For example, calling `.generate(input_ids=input_ids, num_beams=4, speech_do_sample=True)` will successively
+        perform beam-search decoding on the text model, and multinomial beam-search sampling on the speech model.
+
+        For an overview of generation strategies and code examples, check out the [following
+        guide](./generation_strategies).
+
+        </Tip>
+
+
+        Args:
+            input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+                Indices of input sequence tokens in the vocabulary.
+
+                Indices can be obtained using [`SeamlessM4TTokenizer`] or [`SeamlessM4TProcessor`]. See
+                [`PreTrainedTokenizer.encode`] and [`PreTrainedTokenizer.__call__`] for details.
+
+                [What are input IDs?](../glossary#input-ids)
+            input_features (`torch.FloatTensor` of shape `(batch_size, sequence_length, num_banks)`, *optional*):
+                Input audio features. This should be returned by the [`SeamlessM4TFeatureExtractor`] class or the
+                [`SeamlessM4TProcessor`] class. See [`SeamlessM4TFeatureExtractor.__call__`] for details.
+            return_intermediate_token_ids (`bool`, *optional*):
+                If `True`, also returns the intermediate generated text and unit tokens. Set to `True` if you also want
+                to get translated text alongside the audio.
+                Note that if `generate_speech=False`, this parameter will be ignored and
+                the text tokens are returned.
+            tgt_lang (`str`, *optional*):
+                The language to use as target language for translation.
+            speaker_id (`int`, *optional*, defaults to 0):
+                The id of the speaker used for speech synthesis. Must be lower than `config.vocoder_num_spkrs`.
+            generate_speech (`bool`, *optional*, defaults to `True`):
+                If `False`, will only returns the text tokens and won't generate speech.
+
+            kwargs (*optional*):
+                Remaining dictioy of keyword arguments that will be passed to [`GenerationMixin.generate`]. Keyword
+                arguments are of two types:
+
+                    - Without a prefix, they will be entered as `**kwargs` for the `generate` method of each sub-model,
+                    except for `decoder_input_ids` which will only be passed through the text components.
+                    - With a *text_* or *speech_* prefix, they will be input for the `generate` method of the
+                    text model and speech model respectively. It has the priority over the keywords without a prefix.
+
+                    This means you can, for example, specify a generation strategy for one generation but not for the
+                    other.
+
+        Returns:
+            `Union[SeamlessM4Tv2GenerationOutput, tuple[Tensor], ModelOutput]`:
+            - If `generate_speech` and `return_intermediate_token_ids`, returns [`SeamlessM4Tv2GenerationOutput`].
+            - If `generate_speech` and not `return_intermediate_token_ids`, returns a tuple composed of waveforms of
+              shape `(batch_size, sequence_length)` and `waveform_lengths` which gives the length of each sample.
+            - If `generate_speech=False`, it will returns `ModelOutput`.
+        """
+        if input_ids is None and input_features is None and kwargs.get("inputs_embeds") is None:
+            raise ValueError(
+                "`input_ids`,`input_features` and `inputs_embeds` are all empty. Make sure at least one of them is not."
+            )
+
+        if generate_speech and tgt_lang is None:
+            raise ValueError("You must specify a `tgt_lang` to generate translated speech.")
+
+        if tgt_lang is not None:
+            # also accept __xxx__
+            tgt_lang = tgt_lang.replace("__", "")
+            if generate_speech:
+                keys_to_check = ["text_decoder_lang_to_code_id", "t2u_lang_code_to_id", "vocoder_lang_code_to_id"]
+            else:
+                keys_to_check = ["text_decoder_lang_to_code_id"]
+            for key in keys_to_check:
+                lang_code_to_id = getattr(self.generation_config, key, None)
+                if lang_code_to_id is None:
+                    raise ValueError(
+                        f"""This model generation config doesn't have a `{key}` key which maps the target language
+                        to the right token id. Make sure to load the right generation config."""
+                    )
+                elif tgt_lang not in lang_code_to_id:
+                    raise ValueError(
+                        f"""`tgt_lang={tgt_lang}` is not supported by this model.
+                    Please specify a `tgt_lang` in {",".join(lang_code_to_id.keys())}. Note that SeamlessM4Tv2 supports
+                    more languages for text translation than for speech synthesis."""
+                    )
+
+        batch_size = (
+            len(input_features)
+            if input_features is not None
+            else (len(input_ids) if input_ids is not None else len(kwargs.get("inputs_embeds")))
+        )
+
+        kwargs_text, kwargs_speech = format_speech_generation_kwargs(kwargs)
+        kwargs_text["output_hidden_states"] = True
+        kwargs_text["return_dict_in_generate"] = True
+        kwargs_text["output_scores"] = True
+
+        text_decoder_input_ids = kwargs_text.get("decoder_input_ids")
+        # overwrite text_decoder_input_ids if tgt_lang is passed. The latter gets priority over decoder_input_ids.
+        if tgt_lang is not None:
+            # tgt_lang gets priority over decoder input ids
+            text_tgt_lang_id = self.generation_config.text_decoder_lang_to_code_id.get(tgt_lang)
+            text_decoder_input_ids = torch.tensor([[text_tgt_lang_id]] * batch_size, device=self.device)
+
+        kwargs_text["decoder_input_ids"] = text_decoder_input_ids
+
+        # first generation
+        if input_features is not None:
+            self.set_modality("speech")
+            if input_ids is not None:
+                logger.warning(
+                    "`input_features` and `input_ids` are both non empty. `input_features` will be used in priority "
+                    "through the speech encoder. Make sure `input_features=None` if you want to use the text encoder."
+                )
+            text_generation_output = super().generate(input_features=input_features, **kwargs_text)
+        else:
+            self.set_modality("text")
+            text_generation_output = super().generate(input_ids=input_ids, input_features=None, **kwargs_text)
+        sequences = text_generation_output.sequences
+
+        if not generate_speech:
+            return text_generation_output
+
+        # prepare second generation
+        num_return_sequences = len(sequences) // batch_size
+        attention_mask = kwargs_speech.get("attention_mask", kwargs_text.get("attention_mask", None))
+
+        # get encoder last hidden states
+        if self.current_modality == "speech":
+            # get last_hidden_state from encoder - must do a pass through the speech encoder
+            encoder_hidden_states = self.speech_encoder(
+                input_features=input_features, attention_mask=attention_mask
+            ).last_hidden_state
+
+            # input modality = speech so new attention mask for the decoder
+            if attention_mask is not None:
+                sub_sampled_lengths = self._compute_sub_sample_lengths_from_attention_mask(attention_mask).to(
+                    encoder_hidden_states.device
+                )
+                attention_mask = _compute_new_attention_mask(
+                    hidden_states=encoder_hidden_states, seq_lens=sub_sampled_lengths
+                )
+        else:
+            encoder_hidden_states = text_generation_output.encoder_hidden_states[-1]
+
+        if attention_mask is not None:
+            # repeat attention mask alongside batch dimension
+            attention_mask = torch.repeat_interleave(attention_mask, num_return_sequences, dim=0)
+
+        # repeat attention mask alongside batch dimension
+        encoder_hidden_states = torch.repeat_interleave(encoder_hidden_states, num_return_sequences, dim=0)
+
+        # get decoder last hidden state - must do a pass through the text decoder
+        t2u_input_embeds = self.text_decoder(
+            input_ids=sequences[:, :-1],  # Manually trim the final EOS token
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=attention_mask,
+        ).last_hidden_state
+
+        pad_token_id = self.generation_config.pad_token_id
+
+        # Compute new attention mask
+        seq_lens = (sequences[:, :-1] != pad_token_id).int().sum(1)
+        t2u_model_attention_mask = _compute_new_attention_mask(t2u_input_embeds, seq_lens)
+        kwargs_speech["attention_mask"] = t2u_model_attention_mask
+
+        # REMOVE EOS and lang_id
+        t2u_input_ids = sequences[:, 2:-1]
+        # replace every other EOS
+        t2u_input_ids = torch.masked_fill(
+            t2u_input_ids, t2u_input_ids == self.generation_config.eos_token_id, pad_token_id
+        )
+
+        # compute t2u_char_input_ids
+        t2u_subwords = self._indices_to_subwords(t2u_input_ids)
+        t2u_char_count_per_id = self._count_character_length_in_subword(
+            t2u_input_ids, t2u_subwords, pad_token_id=pad_token_id
+        )
+
+        # Add pads for lang, EOS tokens as per NLLB "source" tokenizer mode.
+        pad_zero = t2u_char_count_per_id.new_zeros((t2u_char_count_per_id.shape[0], 1))
+        t2u_char_count_per_id = torch.cat([pad_zero, t2u_char_count_per_id, pad_zero], dim=1)
+        t2u_char_input_ids = self._get_char_input_ids(
+            t2u_input_ids, t2u_subwords, t2u_char_count_per_id, pad_token_id=pad_token_id
+        )
+
+        # second pass
+        t2u_output = self.t2u_model(
+            inputs_embeds=t2u_input_embeds,
+            char_input_ids=t2u_char_input_ids,
+            char_count_per_id=t2u_char_count_per_id,
+            **kwargs_speech,
+        )
+
+        t2u_logits = t2u_output[0]
+        padding_mask = t2u_output[1].bool()
+
+        # The text-to-unit model is non auto-regressive. We keep the ability to use sampling with temperature
+        temperature = kwargs_speech.get("temperature", None)
+        if (temperature is None or temperature == 1.0) or not kwargs_speech.get("do_sample", False):
+            unit_ids = t2u_logits.argmax(dim=-1)
+        else:
+            t2u_logits = t2u_logits / temperature
+            # apply softmax
+            probs = nn.functional.softmax(t2u_logits, dim=-1)
+            # reshape to 2D: (batch_size, seq_len, t2u_vocab_size) -> (batch_size*seq_len, t2u_vocab_size)
+            probs = probs.reshape((-1, probs.shape[2]))
+            # multinomial then reshape : (batch_size*seq_len)-> (batch_size,seq_len)
+            unit_ids = torch.multinomial(probs, num_samples=1).view(t2u_logits.shape[0], -1)
+
+        output_unit_ids = unit_ids.detach().clone()
+
+        replace_mask = (unit_ids == self.config.t2u_eos_token_id) | (~padding_mask)
+        # replace eos per pad
+        unit_ids = unit_ids.masked_fill(replace_mask, self.config.t2u_pad_token_id)
+
+        # offset of control symbols
+        unit_ids = torch.where(
+            unit_ids == self.config.t2u_pad_token_id, unit_ids, unit_ids - self.config.vocoder_offset
+        )
+
+        vocoder_tgt_lang_id = self.generation_config.vocoder_lang_code_to_id.get(tgt_lang)
+        vocoder_tgt_lang_id = torch.tensor([[vocoder_tgt_lang_id]] * len(unit_ids), device=self.device)
+
+        speaker_id = torch.tensor([[speaker_id]] * len(unit_ids), device=self.device)
+
+        waveform, waveform_lengths = self.vocoder(
+            input_ids=unit_ids, speaker_id=speaker_id, lang_id=vocoder_tgt_lang_id
+        )
+
+        if return_intermediate_token_ids:
+            return SeamlessM4Tv2GenerationOutput(
+                waveform=waveform,
+                waveform_lengths=waveform_lengths,
+                sequences=sequences,
+                unit_sequences=output_unit_ids,
+            )
+
+        return waveform, waveform_lengths
+
+
+__all__ = [
+    "SeamlessM4Tv2ForTextToSpeech",
+    "SeamlessM4Tv2ForSpeechToSpeech",
+    "SeamlessM4Tv2ForTextToText",
+    "SeamlessM4Tv2ForSpeechToText",
+    "SeamlessM4Tv2Model",
+    "SeamlessM4Tv2PreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2968e9ec3e72ce365688284ea83c7b8743203ca5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/configuration_seed_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/configuration_seed_oss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53e1fb2247ea7de52f7f235261be7c601da30f78
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/configuration_seed_oss.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modeling_seed_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modeling_seed_oss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dc3bf785bcff47269b00a5f5bf50c351763d1d39
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modeling_seed_oss.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modular_seed_oss.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modular_seed_oss.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..09b22fc2b38ad84c6d70e03dab46f0d6b5a89b0f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/seed_oss/__pycache__/modular_seed_oss.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6e7fcb016c02c39d92c6bfa3e16f3375ff7d2b1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/configuration_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/configuration_segformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6defbc4eb8ed4b0e9d2d777517076cf514e52828
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/configuration_segformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/feature_extraction_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/feature_extraction_segformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f457088952f34497bc70313a351f12d1319edc7d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/feature_extraction_segformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d9bd902594f20f523909b4d03bfc03a3a8afe861
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d41073e3aa839a57e1ba05378cb6c5ee442cc6ce
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/image_processing_segformer_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_segformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6e27e305356e9af070c0e779788def9720c967a1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_segformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_tf_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_tf_segformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63a45366939b884fa97c08311f43d29c39d324c2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modeling_tf_segformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modular_segformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modular_segformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..396606b9c2b10d4bfe4643eb2d755f847cd485d8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/__pycache__/modular_segformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/image_processing_segformer_fast.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/image_processing_segformer_fast.py
new file mode 100644
index 0000000000000000000000000000000000000000..36befda0eaa3f80e8f21db9bd9167f98ceb19948
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/image_processing_segformer_fast.py
@@ -0,0 +1,234 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/segformer/modular_segformer.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_segformer.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2025 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Optional, Union
+
+import torch
+from torchvision.transforms.v2 import functional as F
+
+from ...image_processing_utils import BatchFeature
+from ...image_processing_utils_fast import (
+    BaseImageProcessorFast,
+    DefaultFastImageProcessorKwargs,
+    group_images_by_shape,
+    reorder_images,
+)
+from ...image_utils import (
+    IMAGENET_DEFAULT_MEAN,
+    IMAGENET_DEFAULT_STD,
+    ChannelDimension,
+    ImageInput,
+    PILImageResampling,
+    SizeDict,
+    is_torch_tensor,
+)
+from ...processing_utils import Unpack
+from ...utils import TensorType, auto_docstring
+
+
+class SegformerFastImageProcessorKwargs(DefaultFastImageProcessorKwargs):
+    r"""
+    do_reduce_labels (`bool`, *optional*, defaults to `self.do_reduce_labels`):
+        Whether or not to reduce all label values of segmentation maps by 1. Usually used for datasets where 0
+        is used for background, and background itself is not included in all classes of a dataset (e.g.
+        ADE20k). The background label will be replaced by 255.
+    """
+
+    do_reduce_labels: Optional[bool]
+
+
+@auto_docstring
+class SegformerImageProcessorFast(BaseImageProcessorFast):
+    resample = PILImageResampling.BILINEAR
+    image_mean = IMAGENET_DEFAULT_MEAN
+    image_std = IMAGENET_DEFAULT_STD
+    size = {"height": 512, "width": 512}
+    default_to_square = True
+    crop_size = None
+    do_resize = True
+    do_center_crop = None
+    do_rescale = True
+    do_normalize = True
+    do_reduce_labels = False
+    valid_kwargs = SegformerFastImageProcessorKwargs
+    rescale_factor = 1 / 255
+
+    def __init__(self, **kwargs: Unpack[SegformerFastImageProcessorKwargs]):
+        super().__init__(**kwargs)
+
+    def reduce_label(self, labels: list["torch.Tensor"]):
+        for idx in range(len(labels)):
+            label = labels[idx]
+            label = torch.where(label == 0, torch.tensor(255, dtype=label.dtype), label)
+            label = label - 1
+            label = torch.where(label == 254, torch.tensor(255, dtype=label.dtype), label)
+            labels[idx] = label
+
+        return label
+
+    @auto_docstring
+    def preprocess(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[ImageInput] = None,
+        **kwargs: Unpack[SegformerFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        r"""
+        segmentation_maps (`ImageInput`, *optional*):
+            The segmentation maps to preprocess.
+        """
+        return super().preprocess(images, segmentation_maps, **kwargs)
+
+    def _preprocess_image_like_inputs(
+        self,
+        images: ImageInput,
+        segmentation_maps: Optional[ImageInput],
+        do_convert_rgb: bool,
+        input_data_format: ChannelDimension,
+        device: Optional[Union[str, "torch.device"]] = None,
+        **kwargs: Unpack[SegformerFastImageProcessorKwargs],
+    ) -> BatchFeature:
+        """
+        Preprocess image-like inputs.
+        """
+        images = self._prepare_image_like_inputs(
+            images=images, do_convert_rgb=do_convert_rgb, input_data_format=input_data_format, device=device
+        )
+        images_kwargs = kwargs.copy()
+        images_kwargs["do_reduce_labels"] = False
+        batch_feature = self._preprocess(images, **images_kwargs)
+
+        if segmentation_maps is not None:
+            processed_segmentation_maps = self._prepare_image_like_inputs(
+                images=segmentation_maps,
+                expected_ndims=2,
+                do_convert_rgb=False,
+                input_data_format=ChannelDimension.FIRST,
+            )
+
+            segmentation_maps_kwargs = kwargs.copy()
+            segmentation_maps_kwargs.update(
+                {
+                    "do_normalize": False,
+                    "do_rescale": False,
+                    # Nearest interpolation is used for segmentation maps instead of BILINEAR.
+                    "interpolation": F.InterpolationMode.NEAREST_EXACT,
+                }
+            )
+            processed_segmentation_maps = self._preprocess(
+                images=processed_segmentation_maps, **segmentation_maps_kwargs
+            ).pixel_values
+            batch_feature["labels"] = processed_segmentation_maps.squeeze(1).to(torch.int64)
+
+        return batch_feature
+
+    def _preprocess(
+        self,
+        images: list["torch.Tensor"],
+        do_reduce_labels: bool,
+        interpolation: Optional["F.InterpolationMode"],
+        do_resize: bool,
+        do_rescale: bool,
+        do_normalize: bool,
+        size: SizeDict,
+        rescale_factor: float,
+        image_mean: Union[float, list[float]],
+        image_std: Union[float, list[float]],
+        disable_grouping: bool,
+        return_tensors: Optional[Union[str, TensorType]],
+        **kwargs,
+    ) -> BatchFeature:  # Return type can be list if return_tensors=None
+        if do_reduce_labels:
+            images = self.reduce_label(images)  # Apply reduction if needed
+
+        # Group images by size for batched resizing
+        resized_images = images
+        if do_resize:
+            grouped_images, grouped_images_index = group_images_by_shape(images, disable_grouping=disable_grouping)
+            resized_images_grouped = {}
+            for shape, stacked_images in grouped_images.items():
+                resized_stacked_images = self.resize(image=stacked_images, size=size, interpolation=interpolation)
+                resized_images_grouped[shape] = resized_stacked_images
+            resized_images = reorder_images(resized_images_grouped, grouped_images_index)
+
+        # Group images by size for further processing (rescale/normalize)
+        # Needed in case do_resize is False, or resize returns images with different sizes
+        grouped_images, grouped_images_index = group_images_by_shape(resized_images, disable_grouping=disable_grouping)
+        processed_images_grouped = {}
+        for shape, stacked_images in grouped_images.items():
+            # Fused rescale and normalize
+            stacked_images = self.rescale_and_normalize(
+                stacked_images, do_rescale, rescale_factor, do_normalize, image_mean, image_std
+            )
+            processed_images_grouped[shape] = stacked_images
+
+        processed_images = reorder_images(processed_images_grouped, grouped_images_index)
+
+        # Stack images into a single tensor if return_tensors is set
+        processed_images = torch.stack(processed_images, dim=0) if return_tensors else processed_images
+
+        return BatchFeature(data={"pixel_values": processed_images}, tensor_type=return_tensors)
+
+    def post_process_semantic_segmentation(self, outputs, target_sizes: Optional[list[tuple]] = None):
+        """
+        Converts the output of [`SegformerForSemanticSegmentation`] into semantic segmentation maps. Only supports PyTorch.
+
+        Args:
+            outputs ([`SegformerForSemanticSegmentation`]):
+                Raw outputs of the model.
+            target_sizes (`list[Tuple]` of length `batch_size`, *optional*):
+                List of tuples corresponding to the requested final size (height, width) of each prediction. If unset,
+                predictions will not be resized.
+
+        Returns:
+            semantic_segmentation: `list[torch.Tensor]` of length `batch_size`, where each item is a semantic
+            segmentation map of shape (height, width) corresponding to the target_sizes entry (if `target_sizes` is
+            specified). Each entry of each `torch.Tensor` correspond to a semantic class id.
+        """
+        # TODO: add support for other frameworks
+        logits = outputs.logits
+
+        # Resize logits and compute semantic segmentation maps
+        if target_sizes is not None:
+            if len(logits) != len(target_sizes):
+                raise ValueError(
+                    "Make sure that you pass in as many target sizes as the batch dimension of the logits"
+                )
+
+            if is_torch_tensor(target_sizes):
+                target_sizes = target_sizes.numpy()
+
+            semantic_segmentation = []
+
+            for idx in range(len(logits)):
+                resized_logits = torch.nn.functional.interpolate(
+                    logits[idx].unsqueeze(dim=0), size=target_sizes[idx], mode="bilinear", align_corners=False
+                )
+                semantic_map = resized_logits[0].argmax(dim=0)
+                semantic_segmentation.append(semantic_map)
+        else:
+            semantic_segmentation = logits.argmax(dim=1)
+            semantic_segmentation = [semantic_segmentation[i] for i in range(semantic_segmentation.shape[0])]
+
+        return semantic_segmentation
+
+
+__all__ = ["SegformerImageProcessorFast"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/modeling_tf_segformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/modeling_tf_segformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..2f8e68b957487a172101d8b32241a572c71ddf02
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/segformer/modeling_tf_segformer.py
@@ -0,0 +1,1044 @@
+# coding=utf-8
+# Copyright 2022 NVIDIA The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TensorFlow SegFormer model."""
+
+from __future__ import annotations
+
+import math
+
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...file_utils import (
+    add_code_sample_docstrings,
+    add_start_docstrings,
+    add_start_docstrings_to_model_forward,
+    replace_return_docstrings,
+)
+from ...modeling_tf_outputs import TFBaseModelOutput, TFSemanticSegmenterOutput, TFSequenceClassifierOutput
+from ...modeling_tf_utils import (
+    TFPreTrainedModel,
+    TFSequenceClassificationLoss,
+    keras,
+    keras_serializable,
+    unpack_inputs,
+)
+from ...tf_utils import shape_list, stable_softmax
+from ...utils import logging
+from .configuration_segformer import SegformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+# General docstring
+_CONFIG_FOR_DOC = "SegformerConfig"
+
+# Base docstring
+_CHECKPOINT_FOR_DOC = "nvidia/mit-b0"
+_EXPECTED_OUTPUT_SHAPE = [1, 256, 16, 16]
+
+# Image classification docstring
+_IMAGE_CLASS_CHECKPOINT = "nvidia/mit-b0"
+_IMAGE_CLASS_EXPECTED_OUTPUT = "tabby, tabby cat"
+
+
+# Copied from transformers.models.convnext.modeling_tf_convnext.TFConvNextDropPath with ConvNext->Segformer
+class TFSegformerDropPath(keras.layers.Layer):
+    """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
+    References:
+        (1) github.com:rwightman/pytorch-image-models
+    """
+
+    def __init__(self, drop_path: float, **kwargs):
+        super().__init__(**kwargs)
+        self.drop_path = drop_path
+
+    def call(self, x: tf.Tensor, training=None):
+        if training:
+            keep_prob = 1 - self.drop_path
+            shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1)
+            random_tensor = keep_prob + tf.random.uniform(shape, 0, 1)
+            random_tensor = tf.floor(random_tensor)
+            return (x / keep_prob) * random_tensor
+        return x
+
+
+class TFSegformerOverlapPatchEmbeddings(keras.layers.Layer):
+    """Construct the overlapping patch embeddings."""
+
+    def __init__(self, patch_size, stride, num_channels, hidden_size, **kwargs):
+        super().__init__(**kwargs)
+        self.padding = keras.layers.ZeroPadding2D(padding=patch_size // 2)
+        self.proj = keras.layers.Conv2D(
+            filters=hidden_size, kernel_size=patch_size, strides=stride, padding="VALID", name="proj"
+        )
+
+        self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
+        self.num_channels = num_channels
+        self.hidden_size = hidden_size
+
+    def call(self, pixel_values: tf.Tensor) -> tuple[tf.Tensor, int, int]:
+        embeddings = self.proj(self.padding(pixel_values))
+        height = shape_list(embeddings)[1]
+        width = shape_list(embeddings)[2]
+        hidden_dim = shape_list(embeddings)[3]
+        # (batch_size, height, width, num_channels) -> (batch_size, height*width, num_channels)
+        # this can be fed to a Transformer layer
+        embeddings = tf.reshape(embeddings, (-1, height * width, hidden_dim))
+        embeddings = self.layer_norm(embeddings)
+        return embeddings, height, width
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "proj", None) is not None:
+            with tf.name_scope(self.proj.name):
+                self.proj.build([None, None, None, self.num_channels])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.hidden_size])
+
+
+class TFSegformerEfficientSelfAttention(keras.layers.Layer):
+    """SegFormer's efficient self-attention mechanism. Employs the sequence reduction process introduced in the [PvT
+    paper](https://huggingface.co/papers/2102.12122)."""
+
+    def __init__(
+        self,
+        config: SegformerConfig,
+        hidden_size: int,
+        num_attention_heads: int,
+        sequence_reduction_ratio: int,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.hidden_size = hidden_size
+        self.num_attention_heads = num_attention_heads
+
+        if self.hidden_size % self.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({self.hidden_size}) is not a multiple of the number of attention "
+                f"heads ({self.num_attention_heads})"
+            )
+
+        self.attention_head_size = self.hidden_size // self.num_attention_heads
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+        self.query = keras.layers.Dense(self.all_head_size, name="query")
+        self.key = keras.layers.Dense(self.all_head_size, name="key")
+        self.value = keras.layers.Dense(self.all_head_size, name="value")
+
+        self.dropout = keras.layers.Dropout(config.attention_probs_dropout_prob)
+
+        self.sr_ratio = sequence_reduction_ratio
+        if sequence_reduction_ratio > 1:
+            self.sr = keras.layers.Conv2D(
+                filters=hidden_size, kernel_size=sequence_reduction_ratio, strides=sequence_reduction_ratio, name="sr"
+            )
+            self.layer_norm = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm")
+
+    def transpose_for_scores(self, tensor: tf.Tensor) -> tf.Tensor:
+        # Reshape from [batch_size, seq_length, all_head_size]
+        # to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        batch_size = shape_list(tensor)[0]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size]
+        # to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        height: int,
+        width: int,
+        output_attentions: bool = False,
+        training: bool = False,
+    ) -> tf.Tensor | tuple[tf.Tensor, tf.Tensor]:
+        batch_size = shape_list(hidden_states)[0]
+        num_channels = shape_list(hidden_states)[2]
+
+        query_layer = self.transpose_for_scores(self.query(hidden_states))
+
+        if self.sr_ratio > 1:
+            # Reshape to (batch_size, height, width, num_channels)
+            hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
+            # Apply sequence reduction
+            hidden_states = self.sr(hidden_states)
+            # Reshape back to (batch_size, seq_len, num_channels)
+            hidden_states = tf.reshape(hidden_states, (batch_size, -1, num_channels))
+            hidden_states = self.layer_norm(hidden_states)
+
+        key_layer = self.transpose_for_scores(self.key(hidden_states))
+        value_layer = self.transpose_for_scores(self.value(hidden_states))
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+
+        scale = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+        attention_scores = tf.divide(attention_scores, scale)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = stable_softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(attention_probs, training=training)
+
+        context_layer = tf.matmul(attention_probs, value_layer)
+
+        context_layer = tf.transpose(context_layer, perm=[0, 2, 1, 3])
+        # (batch_size, seq_len_q, all_head_size)
+        context_layer = tf.reshape(context_layer, (batch_size, -1, self.all_head_size))
+
+        outputs = (context_layer, attention_probs) if output_attentions else (context_layer,)
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "query", None) is not None:
+            with tf.name_scope(self.query.name):
+                self.query.build([None, None, self.hidden_size])
+        if getattr(self, "key", None) is not None:
+            with tf.name_scope(self.key.name):
+                self.key.build([None, None, self.hidden_size])
+        if getattr(self, "value", None) is not None:
+            with tf.name_scope(self.value.name):
+                self.value.build([None, None, self.hidden_size])
+        if getattr(self, "sr", None) is not None:
+            with tf.name_scope(self.sr.name):
+                self.sr.build([None, None, None, self.hidden_size])
+        if getattr(self, "layer_norm", None) is not None:
+            with tf.name_scope(self.layer_norm.name):
+                self.layer_norm.build([None, None, self.hidden_size])
+
+
+class TFSegformerSelfOutput(keras.layers.Layer):
+    def __init__(self, config: SegformerConfig, hidden_size: int, **kwargs):
+        super().__init__(**kwargs)
+        self.dense = keras.layers.Dense(hidden_size, name="dense")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.hidden_size = hidden_size
+
+    def call(self, hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense", None) is not None:
+            with tf.name_scope(self.dense.name):
+                self.dense.build([None, None, self.hidden_size])
+
+
+class TFSegformerAttention(keras.layers.Layer):
+    def __init__(
+        self,
+        config: SegformerConfig,
+        hidden_size: int,
+        num_attention_heads: int,
+        sequence_reduction_ratio: int,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.self = TFSegformerEfficientSelfAttention(
+            config=config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
+            name="self",
+        )
+        self.dense_output = TFSegformerSelfOutput(config, hidden_size=hidden_size, name="output")
+
+    def call(
+        self, hidden_states: tf.Tensor, height: int, width: int, output_attentions: bool = False
+    ) -> tf.Tensor | tuple[tf.Tensor, tf.Tensor]:
+        self_outputs = self.self(hidden_states, height, width, output_attentions)
+
+        attention_output = self.dense_output(self_outputs[0])
+        outputs = (attention_output,) + self_outputs[1:]  # add attentions if we output them
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "self", None) is not None:
+            with tf.name_scope(self.self.name):
+                self.self.build(None)
+        if getattr(self, "dense_output", None) is not None:
+            with tf.name_scope(self.dense_output.name):
+                self.dense_output.build(None)
+
+
+class TFSegformerDWConv(keras.layers.Layer):
+    def __init__(self, dim: int = 768, **kwargs):
+        super().__init__(**kwargs)
+        self.depthwise_convolution = keras.layers.Conv2D(
+            filters=dim, kernel_size=3, strides=1, padding="same", groups=dim, name="dwconv"
+        )
+        self.dim = dim
+
+    def call(self, hidden_states: tf.Tensor, height: int, width: int) -> tf.Tensor:
+        batch_size = shape_list(hidden_states)[0]
+        num_channels = shape_list(hidden_states)[-1]
+        hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
+        hidden_states = self.depthwise_convolution(hidden_states)
+
+        new_height = shape_list(hidden_states)[1]
+        new_width = shape_list(hidden_states)[2]
+        num_channels = shape_list(hidden_states)[3]
+        hidden_states = tf.reshape(hidden_states, (batch_size, new_height * new_width, num_channels))
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "depthwise_convolution", None) is not None:
+            with tf.name_scope(self.depthwise_convolution.name):
+                self.depthwise_convolution.build([None, None, None, self.dim])
+
+
+class TFSegformerMixFFN(keras.layers.Layer):
+    def __init__(
+        self,
+        config: SegformerConfig,
+        in_features: int,
+        hidden_features: int | None = None,
+        out_features: int | None = None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        out_features = out_features or in_features
+        self.dense1 = keras.layers.Dense(hidden_features, name="dense1")
+        self.depthwise_convolution = TFSegformerDWConv(hidden_features, name="dwconv")
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = get_tf_activation(config.hidden_act)
+        else:
+            self.intermediate_act_fn = config.hidden_act
+        self.dense2 = keras.layers.Dense(out_features, name="dense2")
+        self.dropout = keras.layers.Dropout(config.hidden_dropout_prob)
+        self.hidden_features = hidden_features
+        self.in_features = in_features
+
+    def call(self, hidden_states: tf.Tensor, height: int, width: int, training: bool = False) -> tf.Tensor:
+        hidden_states = self.dense1(hidden_states)
+        hidden_states = self.depthwise_convolution(hidden_states, height, width)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        hidden_states = self.dense2(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "dense1", None) is not None:
+            with tf.name_scope(self.dense1.name):
+                self.dense1.build([None, None, self.in_features])
+        if getattr(self, "depthwise_convolution", None) is not None:
+            with tf.name_scope(self.depthwise_convolution.name):
+                self.depthwise_convolution.build(None)
+        if getattr(self, "dense2", None) is not None:
+            with tf.name_scope(self.dense2.name):
+                self.dense2.build([None, None, self.hidden_features])
+
+
+class TFSegformerLayer(keras.layers.Layer):
+    """This corresponds to the Block class in the original implementation."""
+
+    def __init__(
+        self,
+        config,
+        hidden_size: int,
+        num_attention_heads: int,
+        drop_path: float,
+        sequence_reduction_ratio: int,
+        mlp_ratio: int,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+        self.layer_norm_1 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_1")
+        self.attention = TFSegformerAttention(
+            config,
+            hidden_size=hidden_size,
+            num_attention_heads=num_attention_heads,
+            sequence_reduction_ratio=sequence_reduction_ratio,
+            name="attention",
+        )
+        self.drop_path = TFSegformerDropPath(drop_path) if drop_path > 0.0 else keras.layers.Activation("linear")
+        self.layer_norm_2 = keras.layers.LayerNormalization(epsilon=1e-05, name="layer_norm_2")
+        mlp_hidden_size = int(hidden_size * mlp_ratio)
+        self.mlp = TFSegformerMixFFN(config, in_features=hidden_size, hidden_features=mlp_hidden_size, name="mlp")
+        self.hidden_size = hidden_size
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        height: int,
+        width: int,
+        output_attentions: bool = False,
+        training: bool = False,
+    ) -> tuple:
+        self_attention_outputs = self.attention(
+            self.layer_norm_1(hidden_states),  # in Segformer, layernorm is applied before self-attention
+            height,
+            width,
+            output_attentions=output_attentions,
+            training=training,
+        )
+
+        attention_output = self_attention_outputs[0]
+        outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        # first residual connection (with stochastic depth)
+        attention_output = self.drop_path(attention_output, training=training)
+        hidden_states = attention_output + hidden_states
+        mlp_output = self.mlp(self.layer_norm_2(hidden_states), height, width)
+
+        # second residual connection (with stochastic depth)
+        mlp_output = self.drop_path(mlp_output, training=training)
+        layer_output = mlp_output + hidden_states
+
+        outputs = (layer_output,) + outputs
+
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer_norm_1", None) is not None:
+            with tf.name_scope(self.layer_norm_1.name):
+                self.layer_norm_1.build([None, None, self.hidden_size])
+        if getattr(self, "attention", None) is not None:
+            with tf.name_scope(self.attention.name):
+                self.attention.build(None)
+        if getattr(self, "layer_norm_2", None) is not None:
+            with tf.name_scope(self.layer_norm_2.name):
+                self.layer_norm_2.build([None, None, self.hidden_size])
+        if getattr(self, "mlp", None) is not None:
+            with tf.name_scope(self.mlp.name):
+                self.mlp.build(None)
+
+
+class TFSegformerEncoder(keras.layers.Layer):
+    def __init__(self, config: SegformerConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+
+        # stochastic depth decay rule
+        drop_path_decays = [x.numpy() for x in tf.linspace(0.0, config.drop_path_rate, sum(config.depths))]
+
+        # patch embeddings
+        embeddings = []
+        for i in range(config.num_encoder_blocks):
+            embeddings.append(
+                TFSegformerOverlapPatchEmbeddings(
+                    patch_size=config.patch_sizes[i],
+                    stride=config.strides[i],
+                    num_channels=config.num_channels if i == 0 else config.hidden_sizes[i - 1],
+                    hidden_size=config.hidden_sizes[i],
+                    name=f"patch_embeddings.{i}",
+                )
+            )
+        self.embeddings = embeddings
+
+        # Transformer blocks
+        blocks = []
+        cur = 0
+        for i in range(config.num_encoder_blocks):
+            # each block consists of layers
+            layers = []
+            if i != 0:
+                cur += config.depths[i - 1]
+            for j in range(config.depths[i]):
+                layers.append(
+                    TFSegformerLayer(
+                        config,
+                        hidden_size=config.hidden_sizes[i],
+                        num_attention_heads=config.num_attention_heads[i],
+                        drop_path=drop_path_decays[cur + j],
+                        sequence_reduction_ratio=config.sr_ratios[i],
+                        mlp_ratio=config.mlp_ratios[i],
+                        name=f"block.{i}.{j}",
+                    )
+                )
+            blocks.append(layers)
+
+        self.block = blocks
+
+        # Layer norms
+        self.layer_norms = [
+            keras.layers.LayerNormalization(epsilon=1e-05, name=f"layer_norm.{i}")
+            for i in range(config.num_encoder_blocks)
+        ]
+
+    def call(
+        self,
+        pixel_values: tf.Tensor,
+        output_attentions: bool | None = False,
+        output_hidden_states: bool | None = False,
+        return_dict: bool | None = True,
+        training: bool = False,
+    ) -> tuple | TFBaseModelOutput:
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        batch_size = shape_list(pixel_values)[0]
+
+        hidden_states = pixel_values
+        for idx, x in enumerate(zip(self.embeddings, self.block, self.layer_norms)):
+            embedding_layer, block_layer, norm_layer = x
+            # first, obtain patch embeddings
+            hidden_states, height, width = embedding_layer(hidden_states)
+
+            # second, send embeddings through blocks
+            # (each block consists of multiple layers i.e., list of layers)
+            for i, blk in enumerate(block_layer):
+                layer_outputs = blk(
+                    hidden_states,
+                    height,
+                    width,
+                    output_attentions,
+                    training=training,
+                )
+                hidden_states = layer_outputs[0]
+                if output_attentions:
+                    all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+            # third, apply layer norm
+            hidden_states = norm_layer(hidden_states)
+
+            # fourth, optionally reshape back to (batch_size, height, width, num_channels)
+            if idx != len(self.embeddings) - 1 or (idx == len(self.embeddings) - 1 and self.config.reshape_last_stage):
+                num_channels = shape_list(hidden_states)[-1]
+                hidden_states = tf.reshape(hidden_states, (batch_size, height, width, num_channels))
+
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return TFBaseModelOutput(
+            last_hidden_state=hidden_states, hidden_states=all_hidden_states, attentions=all_self_attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "layer_norms", None) is not None:
+            for layer, shape in zip(self.layer_norms, self.config.hidden_sizes):
+                with tf.name_scope(layer.name):
+                    layer.build([None, None, shape])
+        if getattr(self, "block", None) is not None:
+            for block in self.block:
+                for layer in block:
+                    with tf.name_scope(layer.name):
+                        layer.build(None)
+        if getattr(self, "embeddings", None) is not None:
+            for layer in self.embeddings:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@keras_serializable
+class TFSegformerMainLayer(keras.layers.Layer):
+    config_class = SegformerConfig
+
+    def __init__(self, config: SegformerConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.config = config
+        # hierarchical Transformer encoder
+        self.encoder = TFSegformerEncoder(config, name="encoder")
+
+    @unpack_inputs
+    def call(
+        self,
+        pixel_values: tf.Tensor,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tuple | TFBaseModelOutput:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # When running on CPU, `keras.layers.Conv2D` doesn't support `NCHW` format.
+        # So change the input format from `NCHW` to `NHWC`.
+        # shape = (batch_size, in_height, in_width, in_channels=num_channels)
+        pixel_values = tf.transpose(pixel_values, perm=(0, 2, 3, 1))
+
+        encoder_outputs = self.encoder(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        sequence_output = encoder_outputs[0]
+        # Change to NCHW output format to have uniformity in the modules
+        sequence_output = tf.transpose(sequence_output, perm=[0, 3, 1, 2])
+
+        # Change the other hidden state outputs to NCHW as well
+        if output_hidden_states:
+            hidden_states = tuple(tf.transpose(h, perm=(0, 3, 1, 2)) for h in encoder_outputs[1])
+
+        if not return_dict:
+            if tf.greater(len(encoder_outputs[1:]), 0):
+                transposed_encoder_outputs = tuple(tf.transpose(v, perm=[0, 3, 1, 2]) for v in encoder_outputs[1:][0])
+                return (sequence_output,) + (transposed_encoder_outputs,)
+            else:
+                return (sequence_output,) + encoder_outputs[1:]
+
+        return TFBaseModelOutput(
+            last_hidden_state=sequence_output,
+            hidden_states=hidden_states if output_hidden_states else encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "encoder", None) is not None:
+            with tf.name_scope(self.encoder.name):
+                self.encoder.build(None)
+
+
+class TFSegformerPreTrainedModel(TFPreTrainedModel):
+    """
+    An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+    models.
+    """
+
+    config_class = SegformerConfig
+    base_model_prefix = "segformer"
+    main_input_name = "pixel_values"
+
+    @property
+    def input_signature(self):
+        return {"pixel_values": tf.TensorSpec(shape=(None, self.config.num_channels, 512, 512), dtype=tf.float32)}
+
+
+SEGFORMER_START_DOCSTRING = r"""
+    This model inherits from [`TFPreTrainedModel`]. Check the superclass documentation for the generic methods the
+    library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
+    etc.)
+
+    This model is also a [keras.Model](https://www.tensorflow.org/api_docs/python/tf/keras/Model) subclass. Use it
+    as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage and
+    behavior.
+
+    Parameters:
+        config ([`SegformerConfig`]): Model configuration class with all the parameters of the model.
+            Initializing with a config file does not load the weights associated with the model, only the
+            configuration. Check out the [`~TFPreTrainedModel.from_pretrained`] method to load the model weights.
+"""
+
+SEGFORMER_INPUTS_DOCSTRING = r"""
+
+    Args:
+        pixel_values (`np.ndarray`, `tf.Tensor`, `list[tf.Tensor]` ``dict[str, tf.Tensor]` or `dict[str, np.ndarray]` and each example must have the shape `(batch_size, num_channels, height, width)`):
+            Pixel values. Pixel values can be obtained using [`AutoImageProcessor`]. See
+            [`SegformerImageProcessor.__call__`] for details.
+
+        output_attentions (`bool`, *optional*):
+            Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
+            tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+            config will be used instead.
+
+        output_hidden_states (`bool`, *optional*):
+            Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
+            more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+            used instead.
+
+        return_dict (`bool`, *optional*):
+            Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple. This argument can be used in
+            eager mode, in graph mode the value will always be set to True.
+
+        training (`bool`, *optional*, defaults to `False``):
+            Whether or not to use the model in training mode (some modules like dropout modules have different
+            behaviors between training and evaluation).
+"""
+
+
+@add_start_docstrings(
+    "The bare SegFormer encoder (Mix-Transformer) outputting raw hidden-states without any specific head on top.",
+    SEGFORMER_START_DOCSTRING,
+)
+class TFSegformerModel(TFSegformerPreTrainedModel):
+    def __init__(self, config: SegformerConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.config = config
+
+        # hierarchical Transformer encoder
+        self.segformer = TFSegformerMainLayer(config, name="segformer")
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("(batch_size, sequence_length)"))
+    @add_code_sample_docstrings(
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFBaseModelOutput,
+        config_class=_CONFIG_FOR_DOC,
+        modality="vision",
+        expected_output=_EXPECTED_OUTPUT_SHAPE,
+    )
+    def call(
+        self,
+        pixel_values: tf.Tensor,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+        training: bool = False,
+    ) -> tuple | TFBaseModelOutput:
+        outputs = self.segformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+        )
+        return outputs
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "segformer", None) is not None:
+            with tf.name_scope(self.segformer.name):
+                self.segformer.build(None)
+
+
+@add_start_docstrings(
+    """
+    SegFormer Model transformer with an image classification head on top (a linear layer on top of the final hidden
+    states) e.g. for ImageNet.
+    """,
+    SEGFORMER_START_DOCSTRING,
+)
+class TFSegformerForImageClassification(TFSegformerPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: SegformerConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+
+        self.num_labels = config.num_labels
+        self.segformer = TFSegformerMainLayer(config, name="segformer")
+
+        # Classifier head
+        self.classifier = keras.layers.Dense(config.num_labels, name="classifier")
+        self.config = config
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        checkpoint=_IMAGE_CLASS_CHECKPOINT,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+        expected_output=_IMAGE_CLASS_EXPECTED_OUTPUT,
+    )
+    def call(
+        self,
+        pixel_values: tf.Tensor | None = None,
+        labels: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+    ) -> tuple | TFSequenceClassifierOutput:
+        outputs = self.segformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        sequence_output = outputs[0]
+
+        # convert last hidden states to (batch_size, height*width, hidden_size)
+        batch_size = shape_list(sequence_output)[0]
+        sequence_output = tf.transpose(sequence_output, perm=[0, 2, 3, 1])
+        sequence_output = tf.reshape(sequence_output, (batch_size, -1, self.config.hidden_sizes[-1]))
+
+        # global average pooling
+        sequence_output = tf.reduce_mean(sequence_output, axis=1)
+
+        logits = self.classifier(sequence_output)
+
+        loss = None if labels is None else self.hf_compute_loss(labels=labels, logits=logits)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "segformer", None) is not None:
+            with tf.name_scope(self.segformer.name):
+                self.segformer.build(None)
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, self.config.hidden_sizes[-1]])
+
+
+class TFSegformerMLP(keras.layers.Layer):
+    """
+    Linear Embedding.
+    """
+
+    def __init__(self, input_dim: int, config: SegformerConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.proj = keras.layers.Dense(config.decoder_hidden_size, name="proj")
+        self.input_dim = input_dim
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        height = shape_list(hidden_states)[1]
+        width = shape_list(hidden_states)[2]
+        hidden_dim = shape_list(hidden_states)[-1]
+        hidden_states = tf.reshape(hidden_states, (-1, height * width, hidden_dim))
+        hidden_states = self.proj(hidden_states)
+        return hidden_states
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "proj", None) is not None:
+            with tf.name_scope(self.proj.name):
+                self.proj.build([None, None, self.input_dim])
+
+
+class TFSegformerDecodeHead(TFSegformerPreTrainedModel):
+    def __init__(self, config: SegformerConfig, **kwargs):
+        super().__init__(config, **kwargs)
+        # linear layers which will unify the channel dimension of each of the encoder blocks to the same config.decoder_hidden_size
+        mlps = []
+        for i in range(config.num_encoder_blocks):
+            mlp = TFSegformerMLP(config=config, input_dim=config.hidden_sizes[i], name=f"linear_c.{i}")
+            mlps.append(mlp)
+        self.mlps = mlps
+
+        # the following 3 layers implement the ConvModule of the original implementation
+        self.linear_fuse = keras.layers.Conv2D(
+            filters=config.decoder_hidden_size, kernel_size=1, use_bias=False, name="linear_fuse"
+        )
+        self.batch_norm = keras.layers.BatchNormalization(epsilon=1e-5, momentum=0.9, name="batch_norm")
+        self.activation = keras.layers.Activation("relu")
+
+        self.dropout = keras.layers.Dropout(config.classifier_dropout_prob)
+        self.classifier = keras.layers.Conv2D(filters=config.num_labels, kernel_size=1, name="classifier")
+
+        self.config = config
+
+    def call(self, encoder_hidden_states: tf.Tensor, training: bool = False) -> tf.Tensor:
+        all_hidden_states = ()
+        for encoder_hidden_state, mlp in zip(encoder_hidden_states, self.mlps):
+            if self.config.reshape_last_stage is False and len(shape_list(encoder_hidden_state)) == 3:
+                height = tf.math.sqrt(tf.cast(shape_list(encoder_hidden_state)[1], tf.float32))
+                height = width = tf.cast(height, tf.int32)
+                channel_dim = shape_list(encoder_hidden_state)[-1]
+                encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
+
+            # unify channel dimension
+            encoder_hidden_state = tf.transpose(encoder_hidden_state, perm=[0, 2, 3, 1])
+            height, width = shape_list(encoder_hidden_state)[1:3]
+            encoder_hidden_state = mlp(encoder_hidden_state)
+            channel_dim = shape_list(encoder_hidden_state)[-1]
+            encoder_hidden_state = tf.reshape(encoder_hidden_state, (-1, height, width, channel_dim))
+
+            # upsample
+            temp_state = tf.transpose(encoder_hidden_states[0], perm=[0, 2, 3, 1])
+            upsample_resolution = shape_list(temp_state)[1:-1]
+            encoder_hidden_state = tf.image.resize(encoder_hidden_state, size=upsample_resolution, method="bilinear")
+            all_hidden_states += (encoder_hidden_state,)
+
+        hidden_states = self.linear_fuse(tf.concat(all_hidden_states[::-1], axis=-1))
+        hidden_states = self.batch_norm(hidden_states, training=training)
+        hidden_states = self.activation(hidden_states)
+        hidden_states = self.dropout(hidden_states, training=training)
+
+        # logits of shape (batch_size, height/4, width/4, num_labels)
+        logits = self.classifier(hidden_states)
+
+        return logits
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "linear_fuse", None) is not None:
+            with tf.name_scope(self.linear_fuse.name):
+                self.linear_fuse.build(
+                    [None, None, None, self.config.decoder_hidden_size * self.config.num_encoder_blocks]
+                )
+        if getattr(self, "batch_norm", None) is not None:
+            with tf.name_scope(self.batch_norm.name):
+                self.batch_norm.build([None, None, None, self.config.decoder_hidden_size])
+        if getattr(self, "classifier", None) is not None:
+            with tf.name_scope(self.classifier.name):
+                self.classifier.build([None, None, None, self.config.decoder_hidden_size])
+        if getattr(self, "mlps", None) is not None:
+            for layer in self.mlps:
+                with tf.name_scope(layer.name):
+                    layer.build(None)
+
+
+@add_start_docstrings(
+    """SegFormer Model transformer with an all-MLP decode head on top e.g. for ADE20k, CityScapes.""",
+    SEGFORMER_START_DOCSTRING,
+)
+class TFSegformerForSemanticSegmentation(TFSegformerPreTrainedModel):
+    def __init__(self, config: SegformerConfig, **kwargs):
+        super().__init__(config, **kwargs)
+        self.segformer = TFSegformerMainLayer(config, name="segformer")
+        self.decode_head = TFSegformerDecodeHead(config, name="decode_head")
+
+    def hf_compute_loss(self, logits, labels):
+        # upsample logits to the images' original size
+        # `labels` is of shape (batch_size, height, width)
+        label_interp_shape = shape_list(labels)[1:]
+
+        upsampled_logits = tf.image.resize(logits, size=label_interp_shape, method="bilinear")
+        # compute weighted loss
+        loss_fct = keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")
+
+        def masked_loss(real, pred):
+            unmasked_loss = loss_fct(real, pred)
+            mask = tf.cast(real != self.config.semantic_loss_ignore_index, dtype=unmasked_loss.dtype)
+            masked_loss = unmasked_loss * mask
+            # Reduction strategy in the similar spirit with
+            # https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_utils.py#L210
+            reduced_masked_loss = tf.reduce_sum(masked_loss) / tf.reduce_sum(mask)
+            return tf.reshape(reduced_masked_loss, (1,))
+
+        return masked_loss(labels, upsampled_logits)
+
+    @unpack_inputs
+    @add_start_docstrings_to_model_forward(SEGFORMER_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @replace_return_docstrings(output_type=TFSemanticSegmenterOutput, config_class=_CONFIG_FOR_DOC)
+    def call(
+        self,
+        pixel_values: tf.Tensor,
+        labels: tf.Tensor | None = None,
+        output_attentions: bool | None = None,
+        output_hidden_states: bool | None = None,
+        return_dict: bool | None = None,
+    ) -> tuple | TFSemanticSegmenterOutput:
+        r"""
+        labels (`tf.Tensor` of shape `(batch_size, height, width)`, *optional*):
+            Ground truth semantic segmentation maps for computing the loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels > 1`, a (per-pixel) classification loss is computed
+            (Cross-Entropy).
+
+        Returns:
+
+        Examples:
+
+        ```python
+        >>> from transformers import AutoImageProcessor, TFSegformerForSemanticSegmentation
+        >>> from PIL import Image
+        >>> import requests
+
+        >>> url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+        >>> image = Image.open(requests.get(url, stream=True).raw)
+
+        >>> image_processor = AutoImageProcessor.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+        >>> model = TFSegformerForSemanticSegmentation.from_pretrained("nvidia/segformer-b0-finetuned-ade-512-512")
+
+        >>> inputs = image_processor(images=image, return_tensors="tf")
+        >>> outputs = model(**inputs, training=False)
+        >>> # logits are of shape (batch_size, num_labels, height/4, width/4)
+        >>> logits = outputs.logits
+        >>> list(logits.shape)
+        [1, 150, 128, 128]
+        ```"""
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+
+        if labels is not None and not self.config.num_labels > 1:
+            raise ValueError("The number of labels should be greater than one")
+
+        outputs = self.segformer(
+            pixel_values,
+            output_attentions=output_attentions,
+            output_hidden_states=True,  # we need the intermediate hidden states
+            return_dict=return_dict,
+        )
+
+        encoder_hidden_states = outputs.hidden_states if return_dict else outputs[1]
+
+        logits = self.decode_head(encoder_hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss = self.hf_compute_loss(logits=logits, labels=labels)
+
+        # make logits of shape (batch_size, num_labels, height, width) to
+        # keep them consistent across APIs
+        logits = tf.transpose(logits, perm=[0, 3, 1, 2])
+
+        if not return_dict:
+            if output_hidden_states:
+                output = (logits,) + outputs[1:]
+            else:
+                output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSemanticSegmenterOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states if output_hidden_states else None,
+            attentions=outputs.attentions,
+        )
+
+    def build(self, input_shape=None):
+        if self.built:
+            return
+        self.built = True
+        if getattr(self, "segformer", None) is not None:
+            with tf.name_scope(self.segformer.name):
+                self.segformer.build(None)
+        if getattr(self, "decode_head", None) is not None:
+            with tf.name_scope(self.decode_head.name):
+                self.decode_head.build(None)
+
+
+__all__ = [
+    "TFSegformerDecodeHead",
+    "TFSegformerForImageClassification",
+    "TFSegformerForSemanticSegmentation",
+    "TFSegformerModel",
+    "TFSegformerPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a525a70490db62f8d28587f02b60d6714490b5d4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..68c7414c7bb86e2a2ff1fd55a3d80929a2703eef
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/configuration_shieldgemma2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5db8eecf989cc1f44be80b4a9aafc4bae0894d73
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/modeling_shieldgemma2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c2611d3bbcf917b376974f5dbe66842f677f2f44
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/shieldgemma2/__pycache__/processing_shieldgemma2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0aba93226dc854776048e1063a16d4b584af862e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/configuration_smolvlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/configuration_smolvlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1dd132e277a8b2961f000869ba4563507949f011
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/smolvlm/__pycache__/configuration_smolvlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2e66626045d6dc5c8c2ed303756cb9679c18735c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/configuration_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/configuration_speecht5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dffa2013d9922bea2630a6d8aad3eb82fa000fd9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/configuration_speecht5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/feature_extraction_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/feature_extraction_speecht5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..068775f55dc2f0990b31842d765ddb67a16a6c88
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/feature_extraction_speecht5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/number_normalizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/number_normalizer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fed2966b02a77c460d0f2d8bb0f152e6cbed8b07
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/number_normalizer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/processing_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/processing_speecht5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cdb7da1fd43b5bc674042807e05d62ed27e87a0b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/processing_speecht5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/tokenization_speecht5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/tokenization_speecht5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..181fffc523ee7070011331b40189a02a036a1ead
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/speecht5/__pycache__/tokenization_speecht5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45c1aa9dfcabf1c4ed1732b8cfd476a59c46e388
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2ab8941b6a33da49130345150964a7e85aa93cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/configuration_splinter.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e8000d47fb7a5552ac1c2624aede64171a37b838
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/modeling_splinter.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e2190271f12d831508acc5db666935a2eaba792a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f9e00687933fb0d1b972d83f2341aa0294db04c1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/splinter/__pycache__/tokenization_splinter_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..6349255ed3a4756aa7d449ff6af35847886dd136
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_starcoder2 import *
+    from .modeling_starcoder2 import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/configuration_starcoder2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/configuration_starcoder2.py
new file mode 100644
index 0000000000000000000000000000000000000000..a700b4f4254830103c21f7920dd71dfe5066eaf8
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/configuration_starcoder2.py
@@ -0,0 +1,207 @@
+# coding=utf-8
+# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Starcoder2 model configuration"""
+
+from ...configuration_utils import PretrainedConfig
+from ...modeling_rope_utils import rope_config_validation
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Starcoder2Config(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Starcoder2Model`]. It is used to instantiate a
+    Starcoder2 model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the [bigcode/starcoder2-7b](https://huggingface.co/bigcode/starcoder2-7b) model.
+
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 49152):
+            Vocabulary size of the Starcoder2 model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`Starcoder2Model`]
+        hidden_size (`int`, *optional*, defaults to 3072):
+            Dimension of the hidden representations.
+        intermediate_size (`int`, *optional*, defaults to 12288):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 30):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 24):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        num_key_value_heads (`int`, *optional*, defaults to 2):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1` the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245). If it is not specified, will default to `8`.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu_pytorch_tanh"`):
+            The non-linear activation function (function or string) in the decoder.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            The maximum sequence length that this model might ever be used with. Starcoder2's sliding window attention
+            allows sequence of up to 4096*32 tokens.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        norm_epsilon (`float`, *optional*, defaults to 1e-05):
+            Epsilon value for the layer norm
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        bos_token_id (`int`, *optional*, defaults to 50256):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 50256):
+            The id of the "end-of-sequence" token.
+        rope_theta (`float`, *optional*, defaults to 10000.0):
+            The base period of the RoPE embeddings.
+        rope_scaling (`Dict`, *optional*):
+            Dictionary containing the scaling configuration for the RoPE embeddings. NOTE: if you apply new rope type
+            and you expect the model to work on longer `max_position_embeddings`, we recommend you to update this value
+            accordingly.
+            Expected contents:
+                `rope_type` (`str`):
+                    The sub-variant of RoPE to use. Can be one of ['default', 'linear', 'dynamic', 'yarn', 'longrope',
+                    'llama3'], with 'default' being the original RoPE implementation.
+                `factor` (`float`, *optional*):
+                    Used with all rope types except 'default'. The scaling factor to apply to the RoPE embeddings. In
+                    most scaling types, a `factor` of x will enable the model to handle sequences of length x *
+                    original maximum pre-trained length.
+                `original_max_position_embeddings` (`int`, *optional*):
+                    Used with 'dynamic', 'longrope' and 'llama3'. The original max position embeddings used during
+                    pretraining.
+                `attention_factor` (`float`, *optional*):
+                    Used with 'yarn' and 'longrope'. The scaling factor to be applied on the attention
+                    computation. If unspecified, it defaults to value recommended by the implementation, using the
+                    `factor` field to infer the suggested value.
+                `beta_fast` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for extrapolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 32.
+                `beta_slow` (`float`, *optional*):
+                    Only used with 'yarn'. Parameter to set the boundary for interpolation (only) in the linear
+                    ramp function. If unspecified, it defaults to 1.
+                `short_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to short contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `long_factor` (`list[float]`, *optional*):
+                    Only used with 'longrope'. The scaling factor to be applied to long contexts (<
+                    `original_max_position_embeddings`). Must be a list of numbers with the same length as the hidden
+                    size divided by the number of attention heads divided by 2
+                `low_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to low frequency components of the RoPE
+                `high_freq_factor` (`float`, *optional*):
+                    Only used with 'llama3'. Scaling factor applied to high frequency components of the RoPE
+        sliding_window (`int`, *optional*):
+            Sliding window attention window size. If not specified, will default to `None` (no sliding window).
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        residual_dropout (`float`, *optional*, defaults to 0.0):
+            Residual connection dropout value.
+        embedding_dropout (`float`, *optional*, defaults to 0.0):
+            Embedding dropout.
+        use_bias (`bool`, *optional*, defaults to `True`):
+            Whether to use bias term on linear layers of the model.
+
+
+    ```python
+    >>> from transformers import Starcoder2Model, Starcoder2Config
+
+    >>> # Initializing a Starcoder2 7B style configuration
+    >>> configuration = Starcoder2Config()
+
+    >>> # Initializing a model from the Starcoder2 7B style configuration
+    >>> model = Starcoder2Model(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "starcoder2"
+    keys_to_ignore_at_inference = ["past_key_values"]
+    # Default tensor parallel plan for base model `Starcoder2`
+    base_model_tp_plan = {
+        "layers.*.self_attn.q_proj": "colwise",
+        "layers.*.self_attn.k_proj": "colwise",
+        "layers.*.self_attn.v_proj": "colwise",
+        "layers.*.self_attn.o_proj": "rowwise",
+        "layers.*.mlp.c_fc": "colwise",
+        "layers.*.mlp.c_proj": "rowwise",
+    }
+    base_model_pp_plan = {
+        "embed_tokens": (["input_ids"], ["inputs_embeds"]),
+        "layers": (["hidden_states", "attention_mask"], ["hidden_states"]),
+        "norm": (["hidden_states"], ["hidden_states"]),
+    }
+
+    def __init__(
+        self,
+        vocab_size=49152,
+        hidden_size=3072,
+        intermediate_size=12288,
+        num_hidden_layers=30,
+        num_attention_heads=24,
+        num_key_value_heads=2,
+        hidden_act="gelu_pytorch_tanh",
+        max_position_embeddings=4096,
+        initializer_range=0.018042,
+        norm_epsilon=1e-5,
+        use_cache=True,
+        bos_token_id=50256,
+        eos_token_id=50256,
+        rope_theta=10000.0,
+        rope_scaling=None,
+        sliding_window=None,
+        attention_dropout=0.0,
+        residual_dropout=0.0,
+        embedding_dropout=0.0,
+        use_bias=True,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.max_position_embeddings = max_position_embeddings
+        self.hidden_size = hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        self.sliding_window = sliding_window
+        self.use_bias = use_bias
+        self.num_key_value_heads = num_key_value_heads
+        self.hidden_act = hidden_act
+        self.initializer_range = initializer_range
+        self.norm_epsilon = norm_epsilon
+        self.use_cache = use_cache
+        self.rope_theta = rope_theta
+        self.rope_scaling = rope_scaling
+        self.attention_dropout = attention_dropout
+        self.residual_dropout = residual_dropout
+        self.embedding_dropout = embedding_dropout
+        # Validate the correctness of rotary position embeddings parameters
+        # BC: if there is a 'type' field, move it to 'rope_type'.
+        if self.rope_scaling is not None and "type" in self.rope_scaling:
+            self.rope_scaling["rope_type"] = self.rope_scaling["type"]
+        rope_config_validation(self)
+
+        super().__init__(
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            **kwargs,
+        )
+
+
+__all__ = ["Starcoder2Config"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modeling_starcoder2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modeling_starcoder2.py
new file mode 100644
index 0000000000000000000000000000000000000000..dfdfec22ca99cf1f205dcaa103ca448afe8498dd
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modeling_starcoder2.py
@@ -0,0 +1,485 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/starcoder2/modular_starcoder2.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_starcoder2.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+# coding=utf-8
+# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from transformers.utils.generic import check_model_inputs
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...generation import GenerationMixin
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_layers import (
+    GenericForSequenceClassification,
+    GenericForTokenClassification,
+    GradientCheckpointingLayer,
+)
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast
+from ...modeling_rope_utils import ROPE_INIT_FUNCTIONS, dynamic_rope_update
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, auto_docstring, can_return_tuple
+from ...utils.deprecation import deprecate_kwarg
+from .configuration_starcoder2 import Starcoder2Config
+
+
+class Starcoder2MLP(nn.Module):
+    def __init__(self, config: Starcoder2Config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = nn.Linear(embed_dim, config.intermediate_size, bias=config.use_bias)
+        self.c_proj = nn.Linear(config.intermediate_size, embed_dim, bias=config.use_bias)
+        self.act = ACT2FN[config.hidden_act]
+        self.residual_dropout = config.residual_dropout
+
+    def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.residual_dropout, training=self.training)
+        return hidden_states
+
+
+def rotate_half(x):
+    """Rotates half the hidden dims of the input."""
+    x1 = x[..., : x.shape[-1] // 2]
+    x2 = x[..., x.shape[-1] // 2 :]
+    return torch.cat((-x2, x1), dim=-1)
+
+
+def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
+    """Applies Rotary Position Embedding to the query and key tensors.
+
+    Args:
+        q (`torch.Tensor`): The query tensor.
+        k (`torch.Tensor`): The key tensor.
+        cos (`torch.Tensor`): The cosine part of the rotary embedding.
+        sin (`torch.Tensor`): The sine part of the rotary embedding.
+        position_ids (`torch.Tensor`, *optional*):
+            Deprecated and unused.
+        unsqueeze_dim (`int`, *optional*, defaults to 1):
+            The 'unsqueeze_dim' argument specifies the dimension along which to unsqueeze cos[position_ids] and
+            sin[position_ids] so that they can be properly broadcasted to the dimensions of q and k. For example, note
+            that cos[position_ids] and sin[position_ids] have the shape [batch_size, seq_len, head_dim]. Then, if q and
+            k have the shape [batch_size, heads, seq_len, head_dim], then setting unsqueeze_dim=1 makes
+            cos[position_ids] and sin[position_ids] broadcastable to the shapes of q and k. Similarly, if q and k have
+            the shape [batch_size, seq_len, heads, head_dim], then set unsqueeze_dim=2.
+    Returns:
+        `tuple(torch.Tensor)` comprising of the query and key tensors rotated using the Rotary Position Embedding.
+    """
+    cos = cos.unsqueeze(unsqueeze_dim)
+    sin = sin.unsqueeze(unsqueeze_dim)
+    q_embed = (q * cos) + (rotate_half(q) * sin)
+    k_embed = (k * cos) + (rotate_half(k) * sin)
+    return q_embed, k_embed
+
+
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs: Unpack[TransformersKwargs],
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class Starcoder2Attention(nn.Module):
+    """Multi-headed attention from 'Attention Is All You Need' paper"""
+
+    def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.head_dim = getattr(config, "head_dim", None) or config.hidden_size // config.num_attention_heads
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.scaling = self.head_dim**-0.5
+        self.attention_dropout = config.attention_dropout
+        self.is_causal = True
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
+        self.residual_dropout = config.residual_dropout
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=getattr(self.config, "sliding_window", None),  # diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        attn_output = nn.functional.dropout(
+            attn_output, p=self.residual_dropout, training=self.training
+        )  # diff with Llama
+
+        return attn_output, attn_weights
+
+
+class Starcoder2DecoderLayer(GradientCheckpointingLayer):
+    def __init__(self, config: Starcoder2Config, layer_idx: int):
+        super().__init__()
+        self.hidden_size = config.hidden_size
+        self.self_attn = Starcoder2Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Starcoder2MLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        position_embeddings: Optional[tuple[torch.Tensor, torch.Tensor]] = None,  # necessary, but kept here for BC
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> torch.Tensor:
+        residual = hidden_states
+        hidden_states = self.input_layernorm(hidden_states)
+        # Self Attention
+        hidden_states, _ = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            position_embeddings=position_embeddings,
+            **kwargs,
+        )
+        hidden_states = residual + hidden_states
+
+        # Fully Connected
+        residual = hidden_states
+        hidden_states = self.post_attention_layernorm(hidden_states)
+        hidden_states = self.mlp(hidden_states)
+        hidden_states = residual + hidden_states
+        return hidden_states
+
+
+class Starcoder2RotaryEmbedding(nn.Module):
+    inv_freq: torch.Tensor  # fix linting for `register_buffer`
+
+    def __init__(self, config: Starcoder2Config, device=None):
+        super().__init__()
+        # BC: "rope_type" was originally "type"
+        if hasattr(config, "rope_scaling") and isinstance(config.rope_scaling, dict):
+            self.rope_type = config.rope_scaling.get("rope_type", config.rope_scaling.get("type"))
+        else:
+            self.rope_type = "default"
+        self.max_seq_len_cached = config.max_position_embeddings
+        self.original_max_seq_len = config.max_position_embeddings
+
+        self.config = config
+        self.rope_init_fn = ROPE_INIT_FUNCTIONS[self.rope_type]
+
+        inv_freq, self.attention_scaling = self.rope_init_fn(self.config, device)
+        self.register_buffer("inv_freq", inv_freq, persistent=False)
+        self.original_inv_freq = self.inv_freq
+
+    @torch.no_grad()
+    @dynamic_rope_update  # power user: used with advanced RoPE types (e.g. dynamic rope)
+    def forward(self, x, position_ids):
+        inv_freq_expanded = self.inv_freq[None, :, None].float().expand(position_ids.shape[0], -1, 1).to(x.device)
+        position_ids_expanded = position_ids[:, None, :].float()
+
+        device_type = x.device.type if isinstance(x.device.type, str) and x.device.type != "mps" else "cpu"
+        with torch.autocast(device_type=device_type, enabled=False):  # Force float32
+            freqs = (inv_freq_expanded.float() @ position_ids_expanded.float()).transpose(1, 2)
+            emb = torch.cat((freqs, freqs), dim=-1)
+            cos = emb.cos() * self.attention_scaling
+            sin = emb.sin() * self.attention_scaling
+
+        return cos.to(dtype=x.dtype), sin.to(dtype=x.dtype)
+
+
+@auto_docstring
+class Starcoder2PreTrainedModel(PreTrainedModel):
+    config: Starcoder2Config
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["Starcoder2DecoderLayer"]
+    _skip_keys_device_placement = ["past_key_values"]
+    _supports_flash_attn = True
+    _supports_sdpa = True
+    _supports_flex_attn = True
+
+    _can_compile_fullgraph = True
+    _supports_attention_backend = True
+    _can_record_outputs = {
+        "hidden_states": Starcoder2DecoderLayer,
+        "attentions": Starcoder2Attention,
+    }
+
+
+@auto_docstring
+class Starcoder2Model(Starcoder2PreTrainedModel):
+    def __init__(self, config: Starcoder2Config):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        self.layers = nn.ModuleList(
+            [Starcoder2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.rotary_emb = Starcoder2RotaryEmbedding(config=config)
+        self.gradient_checkpointing = False
+        self.embedding_dropout = config.embedding_dropout
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
+        causal_mask = mask_function(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.embedding_dropout, training=self.training
+        )  # main diff with Llama
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+
+
+@auto_docstring
+class Starcoder2ForCausalLM(Starcoder2PreTrainedModel, GenerationMixin):
+    _tied_weights_keys = ["lm_head.weight"]
+    _tp_plan = {"lm_head": "colwise_rep"}
+    _pp_plan = {"lm_head": (["hidden_states"], ["logits"])}
+
+    def __init__(self, config):
+        super().__init__(config)
+        self.model = Starcoder2Model(config)
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @can_return_tuple
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Cache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> CausalLMOutputWithPast:
+        r"""
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, Starcoder2ForCausalLM
+
+        >>> model = Starcoder2ForCausalLM.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf")
+        >>> tokenizer = AutoTokenizer.from_pretrained("meta-starcoder2/Starcoder2-2-7b-hf")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+        outputs: BaseModelOutputWithPast = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            cache_position=cache_position,
+            **kwargs,
+        )
+
+        hidden_states = outputs.last_hidden_state
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits=logits, labels=labels, vocab_size=self.config.vocab_size, **kwargs)
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class Starcoder2ForSequenceClassification(GenericForSequenceClassification, Starcoder2PreTrainedModel):
+    pass
+
+
+class Starcoder2ForTokenClassification(GenericForTokenClassification, Starcoder2PreTrainedModel):
+    pass
+
+
+__all__ = [
+    "Starcoder2ForCausalLM",
+    "Starcoder2Model",
+    "Starcoder2PreTrainedModel",
+    "Starcoder2ForSequenceClassification",
+    "Starcoder2ForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modular_starcoder2.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modular_starcoder2.py
new file mode 100644
index 0000000000000000000000000000000000000000..f37c75510abea044cd800f12adfa098eb6d418b2
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/starcoder2/modular_starcoder2.py
@@ -0,0 +1,238 @@
+# coding=utf-8
+# Copyright 2024 BigCode and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Starcoder2 model."""
+
+from typing import Callable, Optional, Union
+
+import torch
+from torch import nn
+
+from transformers.utils.generic import check_model_inputs
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache, DynamicCache
+from ...masking_utils import create_causal_mask, create_sliding_window_causal_mask
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS
+from ...processing_utils import Unpack
+from ...utils import TransformersKwargs, logging
+from ...utils.deprecation import deprecate_kwarg
+from ..mistral.modeling_mistral import (
+    MistralAttention,
+    MistralDecoderLayer,
+    MistralForCausalLM,
+    MistralForSequenceClassification,
+    MistralForTokenClassification,
+    MistralModel,
+    MistralRotaryEmbedding,
+    apply_rotary_pos_emb,
+    eager_attention_forward,
+)
+from .configuration_starcoder2 import Starcoder2Config
+
+
+logger = logging.get_logger(__name__)
+
+
+class Starcoder2MLP(nn.Module):
+    def __init__(self, config: Starcoder2Config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        self.c_fc = nn.Linear(embed_dim, config.intermediate_size, bias=config.use_bias)
+        self.c_proj = nn.Linear(config.intermediate_size, embed_dim, bias=config.use_bias)
+        self.act = ACT2FN[config.hidden_act]
+        self.residual_dropout = config.residual_dropout
+
+    def forward(self, hidden_states: Optional[tuple[torch.FloatTensor]]) -> torch.FloatTensor:
+        hidden_states = self.c_fc(hidden_states)
+        hidden_states = self.act(hidden_states)
+        hidden_states = self.c_proj(hidden_states)
+        hidden_states = nn.functional.dropout(hidden_states, p=self.residual_dropout, training=self.training)
+        return hidden_states
+
+
+class Starcoder2Attention(MistralAttention):
+    def __init__(self, config: Starcoder2Config, layer_idx: Optional[int] = None):
+        super().__init__(config=config, layer_idx=layer_idx)
+        self.residual_dropout = config.residual_dropout
+        self.q_proj = nn.Linear(config.hidden_size, config.num_attention_heads * self.head_dim, bias=config.use_bias)
+        self.k_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.v_proj = nn.Linear(config.hidden_size, config.num_key_value_heads * self.head_dim, bias=config.use_bias)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=config.use_bias)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        position_embeddings: tuple[torch.Tensor, torch.Tensor],
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[Cache] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
+
+        if past_key_values is not None:
+            # sin and cos are specific to RoPE models; cache_position needed for the static cache
+            cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_values.update(key_states, value_states, self.layer_idx, cache_kwargs)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            sliding_window=getattr(self.config, "sliding_window", None),  # diff with Llama
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        attn_output = nn.functional.dropout(
+            attn_output, p=self.residual_dropout, training=self.training
+        )  # diff with Llama
+
+        return attn_output, attn_weights
+
+
+class Starcoder2DecoderLayer(MistralDecoderLayer):
+    def __init__(self, config: Starcoder2Config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.self_attn = Starcoder2Attention(config=config, layer_idx=layer_idx)
+        self.mlp = Starcoder2MLP(config)
+        self.input_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.post_attention_layernorm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+
+
+class Starcoder2RotaryEmbedding(MistralRotaryEmbedding):
+    pass
+
+
+class Starcoder2Model(MistralModel):
+    def __init__(self, config: Starcoder2Config):
+        super().__init__(config)
+        self.layers = nn.ModuleList(
+            [Starcoder2DecoderLayer(config, layer_idx) for layer_idx in range(config.num_hidden_layers)]
+        )
+        self.norm = nn.LayerNorm(config.hidden_size, eps=config.norm_epsilon)
+        self.embedding_dropout = config.embedding_dropout
+
+    @check_model_inputs
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        **kwargs: Unpack[TransformersKwargs],
+    ) -> BaseModelOutputWithPast:
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError("You must specify exactly one of input_ids or inputs_embeds")
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        if use_cache and past_key_values is None:
+            past_key_values = DynamicCache(config=self.config)
+
+        if cache_position is None:
+            past_seen_tokens = past_key_values.get_seq_length() if past_key_values is not None else 0
+            cache_position = torch.arange(
+                past_seen_tokens, past_seen_tokens + inputs_embeds.shape[1], device=inputs_embeds.device
+            )
+
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        mask_function = create_causal_mask if self.config.sliding_window is None else create_sliding_window_causal_mask
+        causal_mask = mask_function(
+            config=self.config,
+            input_embeds=inputs_embeds,
+            attention_mask=attention_mask,
+            cache_position=cache_position,
+            past_key_values=past_key_values,
+            position_ids=position_ids,
+        )
+
+        hidden_states = inputs_embeds
+        hidden_states = nn.functional.dropout(
+            hidden_states, p=self.embedding_dropout, training=self.training
+        )  # main diff with Llama
+
+        # create position embeddings to be shared across the decoder layers
+        position_embeddings = self.rotary_emb(hidden_states, position_ids)
+        for decoder_layer in self.layers[: self.config.num_hidden_layers]:
+            hidden_states = decoder_layer(
+                hidden_states,
+                attention_mask=causal_mask,
+                position_ids=position_ids,
+                past_key_values=past_key_values,
+                use_cache=use_cache,
+                cache_position=cache_position,
+                position_embeddings=position_embeddings,
+                **kwargs,
+            )
+
+        hidden_states = self.norm(hidden_states)
+
+        return BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+        )
+
+
+class Starcoder2ForCausalLM(MistralForCausalLM):
+    pass
+
+
+class Starcoder2ForSequenceClassification(MistralForSequenceClassification):
+    pass
+
+
+class Starcoder2ForTokenClassification(MistralForTokenClassification):
+    pass
+
+
+__all__ = [
+    "Starcoder2ForCausalLM",
+    "Starcoder2Model",
+    "Starcoder2PreTrainedModel",  # noqa: F822
+    "Starcoder2ForSequenceClassification",
+    "Starcoder2ForTokenClassification",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da99e5539d39fac82edea3638686617083701dfc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/configuration_swiftformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/configuration_swiftformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a44f22d992eafe1f8eb28af71ed61ff77dc57b32
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/configuration_swiftformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_swiftformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_swiftformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec7ff9a4e4b7cb848a99e1382db1942df6e848b7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_swiftformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_tf_swiftformer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_tf_swiftformer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..45fdd45532e204eddab34fd0f44bfd28146287f4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swiftformer/__pycache__/modeling_tf_swiftformer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..91ff60c37473a3e1f0cade55144a5630228f03e4
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..00988d9dee0a99a543f35a3b59b016b4479c7c2e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/configuration_swin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..eb7070e14782b961a07ebd47130cedc8c71bc671
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_swin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a87e2def120d46685693d25107e04ac0a8000860
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin/__pycache__/modeling_tf_swin.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1bc06b02db87cc9af450085d66eadb4ceaf5128e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/configuration_swin2sr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/configuration_swin2sr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..126cc37d332ecaa2ec4eb3413f49d84d6c07accb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/configuration_swin2sr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..63c9c273ad4d955241a9fae02e8ac827d237dbbb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..12410b574a74338b16dd30e25e475ddc8a18407d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/image_processing_swin2sr_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/modeling_swin2sr.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/modeling_swin2sr.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61eb559a40472ac3692ab01596a4206e9fb9e130
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swin2sr/__pycache__/modeling_swin2sr.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..da55b0e5a02ae76542e27d989bf0a69da04399b6
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..058ca7abff9e0db7df67f158134adc23d2b21731
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/configuration_swinv2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..671d74394c535a9dda8f18364186bdbd22179738
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/swinv2/__pycache__/modeling_swinv2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..40353cb1b6c2f5b50d7b1b68491448f3faf66a04
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b34b87dd1fe29f1d4349439dee1d625172cca58b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/configuration_switch_transformers.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..170ca883c7e5bce660deefdef87de5529f9f6a62
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/switch_transformers/__pycache__/modeling_switch_transformers.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..13e368498c2ba45a71b292903c99df50276005a9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/configuration_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/configuration_t5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..524653c925dc393ae72ddbc1d8b333d2eba64587
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/configuration_t5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_flax_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_flax_t5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1caea927c68c812c26facbd93773547e95678124
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_flax_t5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_tf_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_tf_t5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bec3e0daa7636cd6862dede9395cdfdd5fd09ea2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/modeling_tf_t5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a5e0c6f68235d14b3c40835dc0d21736c95a4274
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..caf98dd8353c487f3a58acdd7f9cdd71d90092cd
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5/__pycache__/tokenization_t5_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modeling_t5gemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modeling_t5gemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0b5eaf8cc83ea4f105924263cc4737dcad566b1d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/t5gemma/__pycache__/modeling_t5gemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..61a460ad62e7840fbce6dc720b8fd51c2f54e57f
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b98c0e8eb9532220c01d679746f4a560b2ee8d2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/tapas/__pycache__/configuration_tapas.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5e3f1da42b306da2ee87b9061c00bc290426fb87
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/configuration_timm_backbone.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/configuration_timm_backbone.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..03a24b6c9f448282a4ac6228a0ae833b8c5ac85a
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/configuration_timm_backbone.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/modeling_timm_backbone.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/modeling_timm_backbone.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..11133600f961bc1e9f8c9f1ca87f18420c51f875
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/timm_backbone/__pycache__/modeling_timm_backbone.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d004da69a57429ff1386f5effd773dfef78a9ff1
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/configuration_unispeech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/configuration_unispeech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9b1f973f999df2ed4b9e7250ae191259eb1644f8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/configuration_unispeech.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modeling_unispeech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modeling_unispeech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..540d5dca3c01237e7bd009ac4c2e35198633aa69
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modeling_unispeech.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modular_unispeech.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modular_unispeech.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4d5d2c4a362cae3eecde428590db94f6887411f9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/unispeech/__pycache__/modular_unispeech.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0544147da87417220aa0ee93a2a370364ade3854
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/configuration_upernet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/configuration_upernet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0f58214881db9fb940cf36be4d6e7c4508ccd396
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/configuration_upernet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/modeling_upernet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/modeling_upernet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2efcfd85345eaf94c93744df9fc8aa8d37692976
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/upernet/__pycache__/modeling_upernet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..51abf6ec5c6cbd3f8e69bd3d9921e206effd8671
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f29a6ff11f5c474a18b340726d233a23d621e503
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modeling_vaultgemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..344ffc8298d850b144fce18e4e003bde54d30ae8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vaultgemma/__pycache__/modular_vaultgemma.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b314828ae6ed66a28208db9cdfb694901df91ae7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/configuration_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/configuration_video_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0172734f1f1da2b184bf28eab2c5345d00d1d021
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/configuration_video_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/image_processing_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/image_processing_video_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ffabae14a80da959715cf8d93aaae7b0884f6981
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/image_processing_video_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/modeling_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/modeling_video_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..02141a09042ef8340a399654a1a8914fcded4c04
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/modeling_video_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/processing_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/processing_video_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..0fd674e88c672d61869b56d2b4318d90e3be8254
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/processing_video_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/video_processing_video_llava.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/video_processing_video_llava.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..bb345c021b1e817d554ddb6f13388b7593ea7189
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/video_llava/__pycache__/video_processing_video_llava.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7bafa9d7455e6917d2b920db09df28f9ddb12954
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/configuration_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/configuration_vilt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5bba6a019764bedd9a1971d148331bcde50d84a3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/configuration_vilt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/feature_extraction_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/feature_extraction_vilt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..82308a31d82a2255f06a1503ada96231c1098ca2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/feature_extraction_vilt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b8188a1cedf087967deccf8b86ada2dd2bbd4cf7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..99b258fe0b534a23c52cd199f6c262bbe4b7a5ce
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/image_processing_vilt_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/modeling_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/modeling_vilt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6f178f9249bed45568e97c54bb4b63cd9ec91eb5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/modeling_vilt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/processing_vilt.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/processing_vilt.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..53b18fae2e083a7300eba00e29f71e1059a9a057
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vilt/__pycache__/processing_vilt.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e6ef5120c5c472c74a5818f0e6dcea460615d0c8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/configuration_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/configuration_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..37e924e003cded2fdd84cd6d4aa9adadb317445e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/configuration_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/feature_extraction_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/feature_extraction_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f0c5551f80fcb7de8dd2c1179e14ef08a94f6bda
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/feature_extraction_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fb0f13f3daa4d1632cd7aa31ab023cf2df1cce92
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a474898431eb5c82a7e86c919d9615e81ef882a9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/image_processing_vit_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_flax_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_flax_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f6ed4031431c54bb41ab8cd8dfae14c6ccd3a9ca
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_flax_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_tf_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_tf_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..05f0317d6cedf0cbe00b594e72433a7d75b71aa8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_tf_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_vit.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_vit.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e74fd0870ef2814421468daf02cf2ec15015f0ae
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit/__pycache__/modeling_vit.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c6ed5d5606a37fe96bdce3b56df80f24c5fe2ee5
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c02dbafa1f58bb0bd7e497c5938d3586146289f8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/configuration_vit_mae.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..be7dca835849d0cfb2a550a5b78fc1aa9a75daef
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_tf_vit_mae.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9fa4c53534e1d7c79e93065b530b0e4f93961533
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_mae/__pycache__/modeling_vit_mae.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..534e3d2467b563bfbb0bd4ea5683431da48bd132
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/configuration_vit_msn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/configuration_vit_msn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d6481ddfed3d26487e0288c2450f894d9b148265
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/configuration_vit_msn.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/modeling_vit_msn.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/modeling_vit_msn.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..cde6a8cef0ecc42690d3a9d72920b0f8f2d2eb79
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vit_msn/__pycache__/modeling_vit_msn.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..fbef8784eec76673ea922205869a5d074069e81d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7f773916a30f7bc1223e159d7ddedde2f9415b87
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/configuration_vitdet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ec7e679f736afbe19e5d7e5bd2e184193362be25
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitdet/__pycache__/modeling_vitdet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c13e873dc165d9fe71779b4a92ac705b9cae1df
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/configuration_vitmatte.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/configuration_vitmatte.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3336f81cf88dee7d30d3f9ad697fd3bd33b9eac0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/configuration_vitmatte.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f02d995b63c617b609339ffb586e6e233871dd55
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7053ad4179def53f8c89be228a5dc3d8f8604167
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/image_processing_vitmatte_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/modeling_vitmatte.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/modeling_vitmatte.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..67a9a41cfd918d5e3a129e9dc0e98264708e4963
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vitmatte/__pycache__/modeling_vitmatte.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..687edfebb4aee98c87e59ee596197ae22e9da411
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/configuration_vits.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/configuration_vits.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c7fc602f3144f002294eed87336499669c2c4e4d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/configuration_vits.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/modeling_vits.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/modeling_vits.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f3ca9cae3dda1f80e8ad4441c3a26913689498ab
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/modeling_vits.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/tokenization_vits.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/tokenization_vits.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1845d1a515a0960cbaec9a58d4f2d1834fc2f419
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vits/__pycache__/tokenization_vits.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/configuration_vjepa2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/configuration_vjepa2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c95ff0ff28cda94d71305349b58e7b7ff87f9826
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/configuration_vjepa2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/modeling_vjepa2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/modeling_vjepa2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..039c922db27a9bf0db05722fadd7791caefda4b3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/vjepa2/__pycache__/modeling_vjepa2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..db989ad98d208354153506c66da7a6b6952a34b3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/configuration_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/configuration_voxtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..713a494f071a1d335c5963ccf7a708d4f2eee675
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/configuration_voxtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modeling_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modeling_voxtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a4e16395c4c630c24029460677750d56f5c8dc66
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modeling_voxtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modular_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modular_voxtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2aac3f17ef35e30a3139953c0bfcacd3eb7f6fd8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/modular_voxtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/processing_voxtral.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/processing_voxtral.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d37ca5fc4d752c3b3694f20596232c3400e43050
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/voxtral/__pycache__/processing_voxtral.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d76fc2067b5f907dcfee64bd1106bb7ea117bfdc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1b476a79d3041867ecbee5714e5e71fe7ca8bc1e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/configuration_wav2vec2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b49ccf49838e11150e063ab99d1910403d0ff3f3
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/feature_extraction_wav2vec2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3e94aa079a9c6b67836de882b002b0f22a54c98b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_flax_wav2vec2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..26e0db53d6fc11aeccec22fa576122cd4ec1d555
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/modeling_tf_wav2vec2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2a0d809ec64758c3f8d41d06743880009b027cfa
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/processing_wav2vec2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..6545de269255aeda4ab389232248a6d17c2d5bd9
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2/__pycache__/tokenization_wav2vec2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..54d0d9e2c3997c4b2865b5a47e7b64c39ed40b68
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_wav2vec2_conformer import *
+    from .modeling_wav2vec2_conformer import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..493e96f44037d6b846ad702e0c765335d3ecee08
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/configuration_wav2vec2_conformer.py
@@ -0,0 +1,360 @@
+# coding=utf-8
+# Copyright 2022 The Fairseq Authors and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Wav2Vec2Conformer model configuration"""
+
+import functools
+import operator
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class Wav2Vec2ConformerConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`Wav2Vec2ConformerModel`]. It is used to
+    instantiate an Wav2Vec2Conformer model according to the specified arguments, defining the model architecture.
+    Instantiating a configuration with the defaults will yield a similar configuration to that of the Wav2Vec2Conformer
+    [facebook/wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large)
+    architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*):
+            Vocabulary size of the Wav2Vec2Conformer model. Defines the number of different tokens that can be
+            represented by the `inputs_ids` passed when calling [`Wav2Vec2ConformerModel`]. Vocabulary size of the
+            model. Defines the different tokens that can be represented by the *inputs_ids* passed to the forward
+            method of [`Wav2Vec2ConformerModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for activations inside the fully connected layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`Wav2Vec2ConformerForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
+            convolutional layers.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for quantized feature encoder states.
+        conv_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether the 1D convolutional layers have a bias.
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+            embeddings layer.
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+            Number of groups of 1D convolutional positional embeddings layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+            Recognition](https://huggingface.co/papers/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
+            Percentage (between 0 and 1) of all feature vectors along the time axis which will be masked. The masking
+            procedure generates ''mask_time_prob*len(time_axis)/mask_time_length'' independent masks over the axis. If
+            reasoning from the probability of each feature vector to be chosen as the start of the vector span to be
+            masked, *mask_time_prob* should be `prob_vector_start*mask_time_length`. Note that overlap may decrease the
+            actual percentage of masked vectors. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the time axis.
+        mask_time_min_masks (`int`, *optional*, defaults to 2),:
+            The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
+            irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+            mask_time_min_masks''
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
+            Percentage (between 0 and 1) of all feature vectors along the feature axis which will be masked. The
+            masking procedure generates ''mask_feature_prob*len(feature_axis)/mask_time_length'' independent masks over
+            the axis. If reasoning from the probability of each feature vector to be chosen as the start of the vector
+            span to be masked, *mask_feature_prob* should be `prob_vector_start*mask_feature_length`. Note that overlap
+            may decrease the actual percentage of masked vectors. This is only relevant if `apply_spec_augment is
+            True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the feature axis.
+        mask_feature_min_masks (`int`, *optional*, defaults to 0),:
+            The minimum number of masks of length `mask_feature_length` generated along the feature axis, each time
+            step, irrespectively of `mask_feature_prob`. Only relevant if
+            ''mask_feature_prob*len(feature_axis)/mask_feature_length < mask_feature_min_masks''
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
+            Number of entries in each quantization codebook (group).
+        num_codevector_groups (`int`, *optional*, defaults to 2):
+            Number of codevector groups for product codevector quantization.
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        feat_quantizer_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for the output of the feature encoder that's used by the quantizer.
+        num_negatives (`int`, *optional*, defaults to 100):
+            Number of negative samples for the contrastive loss.
+        codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the quantized feature vectors.
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the final projection of both the quantized and the transformer features.
+        diversity_loss_weight (`int`, *optional*, defaults to 0.1):
+            The weight of the codebook diversity loss component.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"sum"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`Wav2Vec2ConformerForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`Wav2Vec2ConformerForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+            instance of [`Wav2Vec2ConformerForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the projection before token mean-pooling for classification.
+        tdnn_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`tuple[int]` or `list[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
+        add_adapter (`bool`, *optional*, defaults to `False`):
+            Whether a convolutional network should be stacked on top of the Wav2Vec2Conformer Encoder. Can be very
+            useful for warm-starting Wav2Vec2Conformer for SpeechEncoderDecoder models.
+        adapter_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adapter_stride (`int`, *optional*, defaults to 2):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        num_adapter_layers (`int`, *optional*, defaults to 3):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+            if `add_adapter is True`.
+        position_embeddings_type (`str`, *optional*, defaults to `"relative"`):
+            Can be specified to `relative` or `rotary` for relative or rotary position embeddings respectively. If left
+            `None` no relative position embedding is applied.
+        rotary_embedding_base (`int`, *optional*, defaults to 10000):
+            If `"rotary"` position embeddings are used, defines the size of the embedding base.
+        max_source_positions (`int`, *optional*, defaults to 5000):
+            if `"relative"` position embeddings are used, defines the maximum source input positions.
+        conv_depthwise_kernel_size (`int`, *optional*, defaults to 31):
+            Kernel size of convolutional depthwise 1D layer in Conformer blocks.
+        conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all convolutional layers in Conformer blocks.
+
+    Example:
+
+    ```python
+    >>> from transformers import Wav2Vec2ConformerConfig, Wav2Vec2ConformerModel
+
+    >>> # Initializing a Wav2Vec2Conformer facebook/wav2vec2-conformer-rel-pos-large style configuration
+    >>> configuration = Wav2Vec2ConformerConfig()
+
+    >>> # Initializing a model (with random weights) from the facebook/wav2vec2-conformer-rel-pos-large style configuration
+    >>> model = Wav2Vec2ConformerModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "wav2vec2-conformer"
+
+    def __init__(
+        self,
+        vocab_size=None,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout=0.1,
+        activation_dropout=0.1,
+        attention_dropout=0.1,
+        feat_proj_dropout=0.0,
+        feat_quantizer_dropout=0.0,
+        final_dropout=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        feat_extract_norm="group",
+        feat_extract_activation="gelu",
+        conv_dim=(512, 512, 512, 512, 512, 512, 512),
+        conv_stride=(5, 2, 2, 2, 2, 2, 2),
+        conv_kernel=(10, 3, 3, 3, 3, 2, 2),
+        conv_bias=False,
+        num_conv_pos_embeddings=128,
+        num_conv_pos_embedding_groups=16,
+        apply_spec_augment=True,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        mask_feature_min_masks=0,
+        num_codevectors_per_group=320,
+        num_codevector_groups=2,
+        contrastive_logits_temperature=0.1,
+        num_negatives=100,
+        codevector_dim=256,
+        proj_codevector_dim=256,
+        diversity_loss_weight=0.1,
+        ctc_loss_reduction="sum",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=256,
+        tdnn_dim=(512, 512, 512, 512, 1500),
+        tdnn_kernel=(5, 3, 3, 1, 1),
+        tdnn_dilation=(1, 2, 3, 1, 1),
+        xvector_output_dim=512,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        add_adapter=False,
+        adapter_kernel_size=3,
+        adapter_stride=2,
+        num_adapter_layers=3,
+        output_hidden_size=None,
+        position_embeddings_type="relative",
+        rotary_embedding_base=10000,
+        max_source_positions=5000,
+        conv_depthwise_kernel_size=31,
+        conformer_conv_dropout=0.1,
+        **kwargs,
+    ):
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        self.hidden_size = hidden_size
+        self.feat_extract_norm = feat_extract_norm
+        self.feat_extract_activation = feat_extract_activation
+        self.conv_dim = list(conv_dim)
+        self.conv_stride = list(conv_stride)
+        self.conv_kernel = list(conv_kernel)
+        self.conv_bias = conv_bias
+        self.num_conv_pos_embeddings = num_conv_pos_embeddings
+        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+        self.num_feat_extract_layers = len(self.conv_dim)
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_attention_heads = num_attention_heads
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.feat_proj_dropout = feat_proj_dropout
+        self.final_dropout = final_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.vocab_size = vocab_size
+        self.use_weighted_layer_sum = use_weighted_layer_sum
+        self.max_source_positions = max_source_positions
+        self.position_embeddings_type = position_embeddings_type
+        self.rotary_embedding_base = rotary_embedding_base
+
+        if (
+            (len(self.conv_stride) != self.num_feat_extract_layers)
+            or (len(self.conv_kernel) != self.num_feat_extract_layers)
+            or (len(self.conv_dim) != self.num_feat_extract_layers)
+        ):
+            raise ValueError(
+                "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
+                " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
+                f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
+                f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
+            )
+
+        # Conformer-block related
+        self.conv_depthwise_kernel_size = conv_depthwise_kernel_size
+        self.conformer_conv_dropout = conformer_conv_dropout
+
+        # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779
+        self.apply_spec_augment = apply_spec_augment
+        self.mask_time_prob = mask_time_prob
+        self.mask_time_length = mask_time_length
+        self.mask_time_min_masks = mask_time_min_masks
+        self.mask_feature_prob = mask_feature_prob
+        self.mask_feature_length = mask_feature_length
+        self.mask_feature_min_masks = mask_feature_min_masks
+
+        # parameters for pretraining with codevector quantized representations
+        self.num_codevectors_per_group = num_codevectors_per_group
+        self.num_codevector_groups = num_codevector_groups
+        self.contrastive_logits_temperature = contrastive_logits_temperature
+        self.feat_quantizer_dropout = feat_quantizer_dropout
+        self.num_negatives = num_negatives
+        self.codevector_dim = codevector_dim
+        self.proj_codevector_dim = proj_codevector_dim
+        self.diversity_loss_weight = diversity_loss_weight
+
+        # ctc loss
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+
+        # adapter
+        self.add_adapter = add_adapter
+        self.adapter_kernel_size = adapter_kernel_size
+        self.adapter_stride = adapter_stride
+        self.num_adapter_layers = num_adapter_layers
+        self.output_hidden_size = output_hidden_size or hidden_size
+
+        # SequenceClassification-specific parameter. Feel free to ignore for other classes.
+        self.classifier_proj_size = classifier_proj_size
+
+        # XVector-specific parameters. Feel free to ignore for other classes.
+        self.tdnn_dim = list(tdnn_dim)
+        self.tdnn_kernel = list(tdnn_kernel)
+        self.tdnn_dilation = list(tdnn_dilation)
+        self.xvector_output_dim = xvector_output_dim
+
+    @property
+    def inputs_to_logits_ratio(self):
+        return functools.reduce(operator.mul, self.conv_stride, 1)
+
+
+__all__ = ["Wav2Vec2ConformerConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..62357c8e0dcb34bf597e414c567491b38fff8e1d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modeling_wav2vec2_conformer.py
@@ -0,0 +1,1928 @@
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+#           This file was automatically generated from src/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py.
+#               Do NOT edit this file manually as any edits will be overwritten by the generation of
+#             the file from the modular. If any change should be done, please apply the change to the
+#                          modular_wav2vec2_conformer.py file directly. One of our CI enforces this.
+#                🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨🚨
+import math
+import warnings
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import numpy as np
+import torch
+from torch import nn
+from torch.nn import CrossEntropyLoss
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import (
+    BaseModelOutput,
+    CausalLMOutput,
+    SequenceClassifierOutput,
+    TokenClassifierOutput,
+    Wav2Vec2BaseModelOutput,
+    XVectorOutput,
+)
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, is_peft_available
+from .configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions.
+    """
+)
+class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
+        paper](https://huggingface.co/papers/2006.11477).
+    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
+        projected quantized states.
+    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
+        target vectors for contrastive loss.
+    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
+        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
+    contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
+        The contrastive loss (L_m) as stated in the [official paper](https://huggingface.co/papers/2006.11477).
+    diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
+        The diversity loss (L_d) as stated in the [official paper](https://huggingface.co/papers/2006.11477).
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    projected_states: Optional[torch.FloatTensor] = None
+    projected_quantized_states: Optional[torch.FloatTensor] = None
+    codevector_perplexity: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    contrastive_loss: Optional[torch.FloatTensor] = None
+    diversity_loss: Optional[torch.FloatTensor] = None
+
+
+class Wav2Vec2ConformerSamePadLayer(nn.Module):
+    def __init__(self, num_conv_pos_embeddings):
+        super().__init__()
+        self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
+
+    def forward(self, hidden_states):
+        if self.num_pad_remove > 0:
+            hidden_states = hidden_states[:, :, : -self.num_pad_remove]
+        return hidden_states
+
+
+class Wav2Vec2ConformerPositionalConvEmbedding(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=config.num_conv_pos_embeddings,
+            padding=config.num_conv_pos_embeddings // 2,
+            groups=config.num_conv_pos_embedding_groups,
+        )
+
+        weight_norm = nn.utils.weight_norm
+        if hasattr(nn.utils.parametrizations, "weight_norm"):
+            weight_norm = nn.utils.parametrizations.weight_norm
+
+        if is_deepspeed_zero3_enabled():
+            import deepspeed
+
+            with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
+                self.conv = weight_norm(self.conv, name="weight", dim=2)
+            if hasattr(self.conv, "parametrizations"):
+                weight_g = self.conv.parametrizations.weight.original0
+                weight_v = self.conv.parametrizations.weight.original1
+            else:
+                weight_g = self.conv.weight_g
+                weight_v = self.conv.weight_v
+            deepspeed.zero.register_external_parameter(self, weight_v)
+            deepspeed.zero.register_external_parameter(self, weight_g)
+        else:
+            self.conv = weight_norm(self.conv, name="weight", dim=2)
+
+        self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = hidden_states.transpose(1, 2)
+
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.padding(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module):
+    """Rotary positional embedding
+    Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        dim = config.hidden_size // config.num_attention_heads
+        base = config.rotary_embedding_base
+
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self.cached_sequence_length = None
+        self.cached_rotary_positional_embedding = None
+
+    def forward(self, hidden_states):
+        sequence_length = hidden_states.shape[1]
+
+        if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
+            return self.cached_rotary_positional_embedding
+
+        self.cached_sequence_length = sequence_length
+        # Embeddings are computed in the dtype of the inv_freq constant
+        time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
+        freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
+        embeddings = torch.cat((freqs, freqs), dim=-1)
+
+        cos_embeddings = embeddings.cos()[:, None, None, :]
+        sin_embeddings = embeddings.sin()[:, None, None, :]
+        # Computed embeddings are cast to the dtype of the hidden state inputs
+        self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states)
+        return self.cached_rotary_positional_embedding
+
+
+class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
+    """Relative positional encoding module."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.max_len = config.max_source_positions
+        self.d_model = config.hidden_size
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
+
+    def extend_pe(self, x):
+        # Reset the positional encodings
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` is the position of query vector and `j` is the
+        # position of key vector. We use positive relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x.size(1), self.d_model)
+        pe_negative = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
+        )
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+        # Reverse the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://huggingface.co/papers/1901.02860
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, hidden_states: torch.Tensor):
+        self.extend_pe(hidden_states)
+        start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
+        end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
+        relative_position_embeddings = self.pe[:, start_idx:end_idx]
+
+        return relative_position_embeddings
+
+
+class Wav2Vec2ConformerNoLayerNormConvLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class Wav2Vec2ConformerLayerNormConvLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+
+        hidden_states = hidden_states.transpose(-2, -1)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = hidden_states.transpose(-2, -1)
+
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class Wav2Vec2ConformerGroupNormConvLayer(GradientCheckpointingLayer):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
+        self.out_conv_dim = config.conv_dim[layer_id]
+
+        self.conv = nn.Conv1d(
+            self.in_conv_dim,
+            self.out_conv_dim,
+            kernel_size=config.conv_kernel[layer_id],
+            stride=config.conv_stride[layer_id],
+            bias=config.conv_bias,
+        )
+        self.activation = ACT2FN[config.feat_extract_activation]
+
+        self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+class Wav2Vec2ConformerFeatureEncoder(nn.Module):
+    """Construct the features from raw audio waveform"""
+
+    def __init__(self, config):
+        super().__init__()
+
+        if config.feat_extract_norm == "group":
+            conv_layers = [Wav2Vec2ConformerGroupNormConvLayer(config, layer_id=0)] + [
+                Wav2Vec2ConformerNoLayerNormConvLayer(config, layer_id=i + 1)
+                for i in range(config.num_feat_extract_layers - 1)
+            ]
+        elif config.feat_extract_norm == "layer":
+            conv_layers = [
+                Wav2Vec2ConformerLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
+            ]
+        else:
+            raise ValueError(
+                f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
+            )
+        self.conv_layers = nn.ModuleList(conv_layers)
+        self.gradient_checkpointing = False
+        self._requires_grad = True
+
+    def _freeze_parameters(self):
+        for param in self.parameters():
+            param.requires_grad = False
+        self._requires_grad = False
+
+    def forward(self, input_values):
+        hidden_states = input_values[:, None]
+
+        # make sure hidden_states require grad for gradient_checkpointing
+        if self._requires_grad and self.training:
+            hidden_states.requires_grad = True
+
+        for conv_layer in self.conv_layers:
+            hidden_states = conv_layer(hidden_states)
+
+        return hidden_states
+
+
+class Wav2Vec2ConformerFeatureProjection(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
+        self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
+        self.dropout = nn.Dropout(config.feat_proj_dropout)
+
+    def forward(self, hidden_states):
+        # non-projected hidden states are needed for quantization
+        norm_hidden_states = self.layer_norm(hidden_states)
+        hidden_states = self.projection(norm_hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        return hidden_states, norm_hidden_states
+
+
+class Wav2Vec2ConformerFeedForward(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.intermediate_dropout = nn.Dropout(config.activation_dropout)
+
+        self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
+        if isinstance(config.hidden_act, str):
+            self.intermediate_act_fn = ACT2FN[config.hidden_act]
+        else:
+            self.intermediate_act_fn = config.hidden_act
+
+        self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
+        self.output_dropout = nn.Dropout(config.hidden_dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.intermediate_dense(hidden_states)
+        hidden_states = self.intermediate_act_fn(hidden_states)
+        hidden_states = self.intermediate_dropout(hidden_states)
+
+        hidden_states = self.output_dense(hidden_states)
+        hidden_states = self.output_dropout(hidden_states)
+        return hidden_states
+
+
+class Wav2Vec2ConformerConvolutionModule(nn.Module):
+    """Convolution block used in the conformer block"""
+
+    def __init__(self, config):
+        super().__init__()
+        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+        self.pointwise_conv1 = nn.Conv1d(
+            config.hidden_size,
+            2 * config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.glu = nn.GLU(dim=1)
+        self.depthwise_conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding=(config.conv_depthwise_kernel_size - 1) // 2,
+            groups=config.hidden_size,
+            bias=False,
+        )
+        self.batch_norm = nn.BatchNorm1d(config.hidden_size)
+        self.activation = ACT2FN[config.hidden_act]
+        self.pointwise_conv2 = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.dropout = nn.Dropout(config.conformer_conv_dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.layer_norm(hidden_states)
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism
+        # => (batch, 2*channel, dim)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # => (batch, channel, dim)
+        hidden_states = self.glu(hidden_states)
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.batch_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class Wav2Vec2ConformerSelfAttention(nn.Module):
+    """Construct an Wav2Vec2ConformerSelfAttention object.
+    Can be enhanced with rotary or relative position embeddings.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.head_size = config.hidden_size // config.num_attention_heads
+        self.num_heads = config.num_attention_heads
+        self.position_embeddings_type = config.position_embeddings_type
+
+        self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.dropout = nn.Dropout(p=config.attention_dropout)
+
+        if self.position_embeddings_type == "relative":
+            # linear transformation for positional encoding
+            self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+            # these two learnable bias are used in matrix c and matrix d
+            # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # self-attention mechanism
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        if self.position_embeddings_type == "rotary":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
+                )
+            query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        if self.position_embeddings_type == "relative":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type =="
+                    " 'relative'"
+                )
+            # apply relative_position_embeddings to qk scores
+            # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860
+            scores = self._apply_relative_embeddings(
+                query=query, key=key, relative_position_embeddings=relative_position_embeddings
+            )
+        else:
+            scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        # apply attention_mask if necessary
+        if attention_mask is not None:
+            scores = scores + attention_mask
+
+        # => (batch, head, time1, time2)
+        probs = torch.softmax(scores, dim=-1)
+        probs = self.dropout(probs)
+
+        # => (batch, head, time1, d_k)
+        hidden_states = torch.matmul(probs, value)
+
+        # => (batch, time1, hidden_size)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        hidden_states = self.linear_out(hidden_states)
+
+        return hidden_states, probs
+
+    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
+
+        cos = relative_position_embeddings[0, :sequence_length, ...]
+        sin = relative_position_embeddings[1, :sequence_length, ...]
+
+        # rotate hidden_states with rotary embeddings
+        hidden_states = hidden_states.transpose(0, 1)
+        rotated_states_begin = hidden_states[..., : self.head_size // 2]
+        rotated_states_end = hidden_states[..., self.head_size // 2 :]
+        rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
+        hidden_states = (hidden_states * cos) + (rotated_states * sin)
+        hidden_states = hidden_states.transpose(0, 1)
+
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
+
+        return hidden_states
+
+    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
+        # 1. project positional embeddings
+        # => (batch, head, 2*time1-1, d_k)
+        proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.view(
+            relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
+        )
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
+
+        # 2. Add bias to query
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
+        q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
+
+        # 3. attention score: first compute matrix a and matrix c
+        # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+        # => (batch, head, time1, time2)
+        scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
+
+        # 4. then compute matrix b and matrix d
+        # => (batch, head, time1, 2*time1-1)
+        scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
+
+        # 5. shift matrix b and matrix d
+        zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
+        scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
+        scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
+        scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
+        scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
+        scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
+
+        # 6. sum matrices
+        # => (batch, head, time1, time2)
+        scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
+
+        return scores
+
+
+class Wav2Vec2ConformerEncoderLayer(GradientCheckpointingLayer):
+    """Conformer block based on https://huggingface.co/papers/2005.08100."""
+
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.attention_dropout
+
+        # Feed-forward 1
+        self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn1 = Wav2Vec2ConformerFeedForward(config)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_dropout = nn.Dropout(dropout)
+        self.self_attn = Wav2Vec2ConformerSelfAttention(config)
+
+        # Conformer Convolution
+        self.conv_module = Wav2Vec2ConformerConvolutionModule(config)
+
+        # Feed-forward 2
+        self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn2 = Wav2Vec2ConformerFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        # 1. Feed-Forward 1 layer
+        residual = hidden_states
+        hidden_states = self.ffn1_layer_norm(hidden_states)
+        hidden_states = self.ffn1(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        residual = hidden_states
+
+        # 2. Self-Attention layer
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weigts = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            relative_position_embeddings=relative_position_embeddings,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # 3. Convolutional Layer
+        residual = hidden_states
+        hidden_states = self.conv_module(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # 4. Feed-Forward 2 Layer
+        residual = hidden_states
+        hidden_states = self.ffn2_layer_norm(hidden_states)
+        hidden_states = self.ffn2(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return hidden_states, attn_weigts
+
+
+class Wav2Vec2ConformerEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        if config.position_embeddings_type == "relative":
+            self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config)
+        elif config.position_embeddings_type == "rotary":
+            self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config)
+        else:
+            self.embed_positions = None
+
+        self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0.0
+
+            # extend attention_mask
+            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+            attention_mask = attention_mask.expand(
+                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+            )
+
+        hidden_states = self.dropout(hidden_states)
+
+        if self.embed_positions is not None:
+            relative_position_embeddings = self.embed_positions(hidden_states)
+        else:
+            relative_position_embeddings = None
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    relative_position_embeddings=relative_position_embeddings,
+                    output_attentions=output_attentions,
+                )
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class Wav2Vec2ConformerGumbelVectorQuantizer(nn.Module):
+    """
+    Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
+    GUMBEL-SOFTMAX](https://huggingface.co/papers/1611.01144) for more information.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        self.num_groups = config.num_codevector_groups
+        self.num_vars = config.num_codevectors_per_group
+
+        if config.codevector_dim % self.num_groups != 0:
+            raise ValueError(
+                f"`config.codevector_dim {config.codevector_dim} must be divisible "
+                f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
+            )
+
+        # storage for codebook variables (codewords)
+        self.codevectors = nn.Parameter(
+            torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
+        )
+        self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
+
+        # can be decayed for training
+        self.temperature = 2
+
+    @staticmethod
+    def _compute_perplexity(probs, mask=None):
+        if mask is not None:
+            mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
+            probs = torch.where(mask_extended, probs, torch.zeros_like(probs))
+            marginal_probs = probs.sum(dim=0) / mask.sum()
+        else:
+            marginal_probs = probs.mean(dim=0)
+
+        perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
+        return perplexity
+
+    def forward(self, hidden_states, mask_time_indices=None):
+        batch_size, sequence_length, hidden_size = hidden_states.shape
+
+        # project to codevector dim
+        hidden_states = self.weight_proj(hidden_states)
+        hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
+
+        if self.training:
+            # sample code vector probs via gumbel in differentiateable way
+            codevector_probs = nn.functional.gumbel_softmax(
+                hidden_states.float(), tau=self.temperature, hard=True
+            ).type_as(hidden_states)
+
+            # compute perplexity
+            codevector_soft_dist = torch.softmax(
+                hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
+            )
+            perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices)
+        else:
+            # take argmax in non-differentiable way
+            # comptute hard codevector distribution (one hot)
+            codevector_idx = hidden_states.argmax(dim=-1)
+            codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
+                -1, codevector_idx.view(-1, 1), 1.0
+            )
+            codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
+
+            perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
+
+        codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
+        # use probs to retrieve codevectors
+        codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
+        codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
+        codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
+
+        return codevectors, perplexity
+
+
+class Wav2Vec2ConformerAdapter(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+
+        # feature dim might need to be down-projected
+        if config.output_hidden_size != config.hidden_size:
+            self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
+            self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
+        else:
+            self.proj = self.proj_layer_norm = None
+
+        self.layers = nn.ModuleList(Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers))
+        self.layerdrop = config.layerdrop
+
+    def forward(self, hidden_states):
+        # down project hidden_states if necessary
+        if self.proj is not None and self.proj_layer_norm is not None:
+            hidden_states = self.proj(hidden_states)
+            hidden_states = self.proj_layer_norm(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)
+
+        for layer in self.layers:
+            layerdrop_prob = np.random.random()
+            if not self.training or (layerdrop_prob > self.layerdrop):
+                hidden_states = layer(hidden_states)
+
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class Wav2Vec2ConformerAdapterLayer(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.conv = nn.Conv1d(
+            config.output_hidden_size,
+            2 * config.output_hidden_size,
+            config.adapter_kernel_size,
+            stride=config.adapter_stride,
+            padding=1,
+        )
+
+    def forward(self, hidden_states):
+        hidden_states = self.conv(hidden_states)
+        hidden_states = nn.functional.glu(hidden_states, dim=1)
+
+        return hidden_states
+
+
+@auto_docstring
+class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):
+    config: Wav2Vec2ConformerConfig
+    base_model_prefix = "wav2vec2_conformer"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init.
+        if isinstance(module, Wav2Vec2ConformerForPreTraining):
+            module.project_hid.reset_parameters()
+            module.project_q.reset_parameters()
+            module.project_hid._is_hf_initialized = True
+            module.project_q._is_hf_initialized = True
+        # gumbel softmax requires special init
+        elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer):
+            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+            module.weight_proj.bias.data.zero_()
+            nn.init.uniform_(module.codevectors)
+        elif isinstance(module, Wav2Vec2ConformerSelfAttention):
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding):
+            nn.init.normal_(
+                module.conv.weight,
+                mean=0,
+                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+            )
+            nn.init.constant_(module.conv.bias, 0)
+        elif isinstance(module, Wav2Vec2ConformerFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the output length of the convolutional layers
+        """
+
+        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        if add_adapter:
+            for _ in range(self.config.num_adapter_layers):
+                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+    ):
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        output_lengths = output_lengths.to(torch.long)
+
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations makes sure that all values before the output lengths idxs are attended to
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask
+
+
+def _compute_mask_indices(
+    shape: tuple[int, int],
+    mask_prob: float,
+    mask_length: int,
+    attention_mask: Optional[torch.LongTensor] = None,
+    min_masks: int = 0,
+) -> np.ndarray:
+    """
+    Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
+    ASR](https://huggingface.co/papers/1904.08779). Note that this method is not optimized to run on TPU and should be run on
+    CPU as part of the preprocessing during training.
+
+    Args:
+        shape: The shape for which to compute masks. This should be of a tuple of size 2 where
+               the first element is the batch size and the second element is the length of the axis to span.
+        mask_prob:  The percentage of the whole axis (between 0 and 1) which will be masked. The number of
+                    independently generated mask spans of length `mask_length` is computed by
+                    `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
+                    actual percentage will be smaller.
+        mask_length: size of the mask
+        min_masks: minimum number of masked spans
+        attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
+                        each batch dimension.
+    """
+    batch_size, sequence_length = shape
+
+    if mask_length < 1:
+        raise ValueError("`mask_length` has to be bigger than 0.")
+
+    if mask_length > sequence_length:
+        raise ValueError(
+            f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
+            f" and `sequence_length`: {sequence_length}`"
+        )
+
+    # epsilon is used for probabilistic rounding
+    epsilon = np.random.rand(1).item()
+
+    def compute_num_masked_span(input_length):
+        """Given input length, compute how many spans should be masked"""
+        num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
+        num_masked_span = max(num_masked_span, min_masks)
+
+        # make sure num masked span <= sequence_length
+        if num_masked_span * mask_length > sequence_length:
+            num_masked_span = sequence_length // mask_length
+
+        # make sure num_masked span is also <= input_length - (mask_length - 1)
+        if input_length - (mask_length - 1) < num_masked_span:
+            num_masked_span = max(input_length - (mask_length - 1), 0)
+
+        return num_masked_span
+
+    # compute number of masked spans in batch
+    input_lengths = (
+        attention_mask.detach().sum(-1).tolist()
+        if attention_mask is not None
+        else [sequence_length for _ in range(batch_size)]
+    )
+
+    # SpecAugment mask to fill
+    spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
+    spec_aug_mask_idxs = []
+
+    max_num_masked_span = compute_num_masked_span(sequence_length)
+
+    if max_num_masked_span == 0:
+        return spec_aug_mask
+
+    for input_length in input_lengths:
+        # compute num of masked spans for this input
+        num_masked_span = compute_num_masked_span(input_length)
+
+        # get random indices to mask
+        spec_aug_mask_idx = np.random.choice(
+            np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
+        )
+
+        # pick first sampled index that will serve as a dummy index to pad vector
+        # to ensure same dimension for all batches due to probabilistic rounding
+        # Picking first sample just pads those vectors twice.
+        if len(spec_aug_mask_idx) == 0:
+            # this case can only happen if `input_length` is strictly smaller then
+            # `sequence_length` in which case the last token has to be a padding
+            # token which we can use as a dummy mask id
+            dummy_mask_idx = sequence_length - 1
+        else:
+            dummy_mask_idx = spec_aug_mask_idx[0]
+
+        spec_aug_mask_idx = np.concatenate(
+            [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
+        )
+        spec_aug_mask_idxs.append(spec_aug_mask_idx)
+
+    spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
+
+    # expand masked indices to masked spans
+    spec_aug_mask_idxs = np.broadcast_to(
+        spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
+
+    # add offset to the starting indexes so that indexes now create a span
+    offsets = np.arange(mask_length)[None, None, :]
+    offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
+        batch_size, max_num_masked_span * mask_length
+    )
+    spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
+
+    # ensure that we cannot have indices larger than sequence_length
+    if spec_aug_mask_idxs.max() > sequence_length - 1:
+        spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
+
+    # scatter indices to mask
+    np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
+
+    return spec_aug_mask
+
+
+Wav2Vec2ConformerBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+@auto_docstring
+class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel):
+    def __init__(self, config: Wav2Vec2ConformerConfig):
+        super().__init__(config)
+        self.config = config
+        self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config)
+        self.feature_projection = Wav2Vec2ConformerFeatureProjection(config)
+
+        # model only needs masking vector if mask prob is > 0.0
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        self.encoder = Wav2Vec2ConformerEncoder(config)
+
+        self.adapter = Wav2Vec2ConformerAdapter(config) if config.add_adapter else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.feature_extractor._freeze_parameters()
+
+    def _mask_hidden_states(
+        self,
+        hidden_states: torch.FloatTensor,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        attention_mask: Optional[torch.LongTensor] = None,
+    ):
+        """
+        Masks extracted features along time axis and/or along feature axis according to
+        [SpecAugment](https://huggingface.co/papers/1904.08779).
+        """
+
+        # `config.apply_spec_augment` can set masking to False
+        if not getattr(self.config, "apply_spec_augment", True):
+            return hidden_states
+
+        # generate indices & apply SpecAugment along time axis
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        if mask_time_indices is not None:
+            # apply SpecAugment along time axis with given mask_time_indices
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+        elif self.config.mask_time_prob > 0 and self.training:
+            mask_time_indices = _compute_mask_indices(
+                (batch_size, sequence_length),
+                mask_prob=self.config.mask_time_prob,
+                mask_length=self.config.mask_time_length,
+                attention_mask=attention_mask,
+                min_masks=self.config.mask_time_min_masks,
+            )
+            mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
+            hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
+
+        if self.config.mask_feature_prob > 0 and self.training:
+            # generate indices & apply SpecAugment along feature axis
+            mask_feature_indices = _compute_mask_indices(
+                (batch_size, hidden_size),
+                mask_prob=self.config.mask_feature_prob,
+                mask_length=self.config.mask_feature_length,
+                min_masks=self.config.mask_feature_min_masks,
+            )
+            mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
+            mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
+            hidden_states[mask_feature_indices] = 0
+
+        return hidden_states
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.FloatTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, Wav2Vec2ConformerBaseModelOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        """
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        extract_features = self.feature_extractor(input_values)
+        extract_features = extract_features.transpose(1, 2)
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        hidden_states, extract_features = self.feature_projection(extract_features)
+        hidden_states = self._mask_hidden_states(
+            hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
+        )
+
+        encoder_outputs = self.encoder(
+            hidden_states,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = encoder_outputs[0]
+
+        if self.adapter is not None:
+            hidden_states = self.adapter(hidden_states)
+
+        if not return_dict:
+            return (hidden_states, extract_features) + encoder_outputs[1:]
+
+        return Wav2Vec2ConformerBaseModelOutput(
+            last_hidden_state=hidden_states,
+            extract_features=extract_features,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Wav2Vec2Conformer Model with a quantizer and `VQ` head on top.
+    """
+)
+class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
+    def __init__(self, config: Wav2Vec2ConformerConfig):
+        super().__init__(config)
+        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+        self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
+
+        self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config)
+
+        self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
+        self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def set_gumbel_temperature(self, temperature: int):
+        """
+        Set the Gumbel softmax temperature to a given value. Only necessary for training
+        """
+        self.quantizer.temperature = temperature
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+    @staticmethod
+    def compute_contrastive_logits(
+        target_features: torch.FloatTensor,
+        negative_features: torch.FloatTensor,
+        predicted_features: torch.FloatTensor,
+        temperature: int = 0.1,
+    ):
+        """
+        Compute logits for contrastive loss based using cosine similarity as the distance measure between
+        `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
+        """
+        target_features = torch.cat([target_features, negative_features], dim=0)
+
+        logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
+            target_features
+        )
+
+        # apply temperature
+        logits = logits / temperature
+        return logits
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        mask_time_indices: Optional[torch.BoolTensor] = None,
+        sampled_negative_indices: Optional[torch.BoolTensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, Wav2Vec2ConformerForPreTrainingOutput]:
+        r"""
+        mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
+            masked extracted features in *config.proj_codevector_dim* space.
+        sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
+            Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
+            Required input for pre-training.
+
+        Example:
+
+        ```python
+        >>> import torch
+        >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining
+        >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import _compute_mask_indices, _sample_negative_indices
+        >>> from datasets import load_dataset
+
+        >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2_conformer-base")
+        >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2_conformer-base")
+
+        >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
+        >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values  # Batch size 1
+
+        >>> # compute masked indices
+        >>> batch_size, raw_sequence_length = input_values.shape
+        >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()
+        >>> mask_time_indices = _compute_mask_indices(
+        ...     shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
+        ... )
+        >>> sampled_negative_indices = _sample_negative_indices(
+        ...     features_shape=(batch_size, sequence_length),
+        ...     num_negatives=model.config.num_negatives,
+        ...     mask_time_indices=mask_time_indices,
+        ... )
+        >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long)
+        >>> sampled_negative_indices = torch.tensor(
+        ...     data=sampled_negative_indices, device=input_values.device, dtype=torch.long
+        ... )
+
+        >>> with torch.no_grad():
+        ...     outputs = model(input_values, mask_time_indices=mask_time_indices)
+
+        >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
+        >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
+
+        >>> # show that cosine similarity is much higher than random
+        >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5
+        tensor(True)
+
+        >>> # for contrastive loss training model should be put into train mode
+        >>> model = model.train()
+        >>> loss = model(
+        ...     input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices
+        ... ).loss
+        ```"""
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if mask_time_indices is not None:
+            mask_time_indices = mask_time_indices.to(torch.bool)
+
+        outputs = self.wav2vec2_conformer(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            mask_time_indices=mask_time_indices,
+            return_dict=return_dict,
+        )
+
+        # 1. project all transformed features (including masked) to final vq dim
+        transformer_features = self.project_hid(outputs[0])
+
+        # 2. quantize all (unmasked) extracted features and project to final vq dim
+        extract_features = self.dropout_features(outputs[1])
+
+        if attention_mask is not None:
+            # compute reduced attention_mask corresponding to feature vectors
+            attention_mask = self._get_feature_vector_attention_mask(
+                extract_features.shape[1], attention_mask, add_adapter=False
+            )
+
+        quantized_features, codevector_perplexity = self.quantizer(
+            extract_features, mask_time_indices=mask_time_indices
+        )
+
+        quantized_features = quantized_features.to(self.project_q.weight.dtype)
+        quantized_features = self.project_q(quantized_features)
+
+        loss = contrastive_loss = diversity_loss = None
+        if sampled_negative_indices is not None:
+            batch_size, sequence_length, hidden_size = quantized_features.shape
+
+            # for training, we sample negatives
+            # 3. sample K negatives (distractors) quantized states for contrastive loss
+            # if attention_mask is passed, make sure that padded feature vectors cannot be sampled
+            # sample negative quantized vectors BTC => (BxT)C
+            negative_quantized_features = quantized_features.view(-1, hidden_size)[
+                sampled_negative_indices.long().view(-1)
+            ]
+            negative_quantized_features = negative_quantized_features.view(
+                batch_size, sequence_length, -1, hidden_size
+            ).permute(2, 0, 1, 3)
+
+            # 4. compute logits, corresponding to `logs = sim(c_t, [q_t, \sim{q}_t]) / \kappa`
+            # of equation (3) in https://huggingface.co/papers/2006.11477
+            logits = self.compute_contrastive_logits(
+                quantized_features[None, :],
+                negative_quantized_features,
+                transformer_features,
+                self.config.contrastive_logits_temperature,
+            )
+
+            # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low),
+            # its cosine similarity will be masked
+            neg_is_pos = (quantized_features == negative_quantized_features).all(-1)
+
+            if neg_is_pos.any():
+                logits[1:][neg_is_pos] = float("-inf")
+
+            # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logs) =
+            # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa))
+            logits = logits.transpose(0, 2).reshape(-1, logits.size(0))
+            target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten()
+
+            contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum")
+            # 7. compute diversity loss: \mathbf{L}_d
+            num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups
+            diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum()
+
+            # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d
+            loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss
+
+        if not return_dict:
+            if loss is not None:
+                return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+            return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
+
+        return Wav2Vec2ConformerForPreTrainingOutput(
+            loss=loss,
+            projected_states=transformer_features,
+            projected_quantized_states=quantized_features,
+            codevector_perplexity=codevector_perplexity,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+            contrastive_loss=contrastive_loss,
+            diversity_loss=diversity_loss,
+        )
+
+
+_HIDDEN_STATES_START_POSITION = 2
+
+
+@auto_docstring(
+    custom_intro="""
+    Wav2Vec2Conformer Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).
+    """
+)
+class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        r"""
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses 'eng' by
+            default.
+        """
+        super().__init__(config)
+
+        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+        self.dropout = nn.Dropout(config.final_dropout)
+
+        self.target_lang = target_lang
+
+        if config.vocab_size is None:
+            raise ValueError(
+                f"You are trying to instantiate {self.__class__} with a configuration that "
+                "does not define the vocabulary size of the language model head. Please "
+                "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
+                "or define `vocab_size` of your model's configuration."
+            )
+        output_hidden_size = (
+            config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
+        )
+        self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, CausalLMOutput]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
+            Labels for connectionist temporal classification. Note that `target_length` has to be smaller or equal to
+            the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
+            All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
+            config.vocab_size - 1]`.
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if labels is not None and labels.max() >= self.config.vocab_size:
+            raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
+
+        outputs = self.wav2vec2_conformer(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        hidden_states = self.dropout(hidden_states)
+
+        logits = self.lm_head(hidden_states)
+
+        loss = None
+        if labels is not None:
+            # retrieve loss input_lengths from attention_mask
+            attention_mask = (
+                attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
+            )
+            input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
+
+            # assuming that padded tokens are filled with -100
+            # when not being attended to
+            labels_mask = labels >= 0
+            target_lengths = labels_mask.sum(-1)
+            flattened_targets = labels.masked_select(labels_mask)
+
+            # ctc_loss doesn't support fp16
+            log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
+
+            with torch.backends.cudnn.flags(enabled=False):
+                loss = nn.functional.ctc_loss(
+                    log_probs,
+                    flattened_targets,
+                    input_lengths,
+                    target_lengths,
+                    blank=self.config.pad_token_id,
+                    reduction=self.config.ctc_loss_reduction,
+                    zero_infinity=self.config.ctc_zero_infinity,
+                )
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return CausalLMOutput(
+            loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
+        )
+
+
+@auto_docstring(
+    custom_intro="""
+    Wav2Vec2Conformer Model with a sequence classification head on top (a linear layer over the pooled output) for tasks like
+    SUPERB Keyword Spotting.
+    """
+)
+class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
+            )
+        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
+        self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2_conformer.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, SequenceClassifierOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2_conformer(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+        if attention_mask is None:
+            pooled_output = hidden_states.mean(dim=1)
+        else:
+            padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
+            expand_padding_mask = padding_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_padding_mask] = 0.0
+            pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
+
+        logits = self.classifier(pooled_output)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+@auto_docstring
+class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        if hasattr(config, "add_adapter") and config.add_adapter:
+            raise ValueError(
+                "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
+            )
+        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.classifier = nn.Linear(config.hidden_size, config.num_labels)
+        self.num_labels = config.num_labels
+
+        self.init_weights()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2_conformer.parameters():
+            param.requires_grad = False
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        labels: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, TokenClassifierOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2_conformer(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        logits = self.classifier(hidden_states)
+
+        loss = None
+        if labels is not None:
+            loss_fct = CrossEntropyLoss()
+            loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
+
+        if not return_dict:
+            output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return output
+
+        return TokenClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+class AMSoftmaxLoss(nn.Module):
+    def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
+        super().__init__()
+        self.scale = scale
+        self.margin = margin
+        self.num_labels = num_labels
+        self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
+        self.loss = nn.CrossEntropyLoss()
+
+    def forward(self, hidden_states, labels):
+        labels = labels.flatten()
+        weight = nn.functional.normalize(self.weight, dim=0)
+        hidden_states = nn.functional.normalize(hidden_states, dim=1)
+        cos_theta = torch.mm(hidden_states, weight)
+        psi = cos_theta - self.margin
+
+        onehot = nn.functional.one_hot(labels, self.num_labels)
+        logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
+        loss = self.loss(logits, labels)
+
+        return loss
+
+
+class TDNNLayer(nn.Module):
+    def __init__(self, config, layer_id=0):
+        super().__init__()
+        self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
+        self.out_conv_dim = config.tdnn_dim[layer_id]
+        self.kernel_size = config.tdnn_kernel[layer_id]
+        self.dilation = config.tdnn_dilation[layer_id]
+
+        self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
+        self.activation = nn.ReLU()
+
+    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
+        if is_peft_available():
+            from peft.tuners.lora import LoraLayer
+
+        if is_peft_available():
+            if isinstance(self.kernel, LoraLayer):
+                warnings.warn(
+                    "Detected LoRA on TDNNLayer. LoRA weights won't be applied due to optimization. "
+                    "You should exclude TDNNLayer from LoRA's target modules.",
+                )
+
+        # for backward compatibility, we keep nn.Linear but call F.conv1d for speed up
+        hidden_states = hidden_states.transpose(1, 2)
+        weight = self.kernel.weight.view(self.out_conv_dim, self.kernel_size, self.in_conv_dim).transpose(1, 2)
+        hidden_states = nn.functional.conv1d(hidden_states, weight, self.kernel.bias, dilation=self.dilation)
+        hidden_states = hidden_states.transpose(1, 2)
+
+        hidden_states = self.activation(hidden_states)
+        return hidden_states
+
+
+@auto_docstring(
+    custom_intro="""
+    Wav2Vec2Conformer Model with an XVector feature extraction head on top for tasks like Speaker Verification.
+    """
+)
+class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+
+        self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
+        num_layers = config.num_hidden_layers + 1  # transformer layers + input embeddings
+        if config.use_weighted_layer_sum:
+            self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
+        self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
+
+        tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
+        self.tdnn = nn.ModuleList(tdnn_layers)
+
+        self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
+        self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
+
+        self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
+
+        self.init_weights()
+
+    def freeze_feature_encoder(self):
+        """
+        Calling this function will disable the gradient computation for the feature encoder so that its parameter will
+        not be updated during training.
+        """
+        self.wav2vec2_conformer.feature_extractor._freeze_parameters()
+
+    def freeze_base_model(self):
+        """
+        Calling this function will disable the gradient computation for the base model so that its parameters will not
+        be updated during training. Only the classification head will be updated.
+        """
+        for param in self.wav2vec2_conformer.parameters():
+            param.requires_grad = False
+
+    def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
+        """
+        Computes the output length of the TDNN layers
+        """
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return (input_length - kernel_size) // stride + 1
+
+        for kernel_size in self.config.tdnn_kernel:
+            input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
+
+        return input_lengths
+
+    @auto_docstring
+    def forward(
+        self,
+        input_values: Optional[torch.Tensor],
+        attention_mask: Optional[torch.Tensor] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[torch.Tensor] = None,
+    ) -> Union[tuple, XVectorOutput]:
+        r"""
+        input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
+            Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
+            into an array of type `list[float]`, a `numpy.ndarray` or a `torch.Tensor`, *e.g.* via the torchcodec library
+            (`pip install torchcodec`) or the soundfile library (`pip install soundfile`).
+            To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and conversion
+            into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2ConformerProcessor.__call__`] for details.
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+        output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
+
+        outputs = self.wav2vec2_conformer(
+            input_values,
+            attention_mask=attention_mask,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+
+        if self.config.use_weighted_layer_sum:
+            hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
+            hidden_states = torch.stack(hidden_states, dim=1)
+            norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
+            hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
+        else:
+            hidden_states = outputs[0]
+
+        hidden_states = self.projector(hidden_states)
+
+        for tdnn_layer in self.tdnn:
+            hidden_states = tdnn_layer(hidden_states)
+
+        # Statistic Pooling
+        if attention_mask is None:
+            mean_features = hidden_states.mean(dim=1)
+            std_features = hidden_states.std(dim=1)
+        else:
+            feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
+            tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
+            mean_features = []
+            std_features = []
+            for i, length in enumerate(tdnn_output_lengths):
+                mean_features.append(hidden_states[i, :length].mean(dim=0))
+                std_features.append(hidden_states[i, :length].std(dim=0))
+            mean_features = torch.stack(mean_features)
+            std_features = torch.stack(std_features)
+        statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
+
+        output_embeddings = self.feature_extractor(statistic_pooling)
+        logits = self.classifier(output_embeddings)
+
+        loss = None
+        if labels is not None:
+            loss = self.objective(logits, labels)
+
+        if not return_dict:
+            output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
+            return ((loss,) + output) if loss is not None else output
+
+        return XVectorOutput(
+            loss=loss,
+            logits=logits,
+            embeddings=output_embeddings,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+
+__all__ = [
+    "Wav2Vec2ConformerForAudioFrameClassification",
+    "Wav2Vec2ConformerForCTC",
+    "Wav2Vec2ConformerForPreTraining",
+    "Wav2Vec2ConformerForSequenceClassification",
+    "Wav2Vec2ConformerForXVector",
+    "Wav2Vec2ConformerModel",
+    "Wav2Vec2ConformerPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py
new file mode 100644
index 0000000000000000000000000000000000000000..bfa6c20737d8060f8ceca29926f113c2fc176ac9
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_conformer/modular_wav2vec2_conformer.py
@@ -0,0 +1,728 @@
+import math
+from dataclasses import dataclass
+from typing import Optional, Union
+
+import torch
+from torch import nn
+
+from ...activations import ACT2FN
+from ...integrations.deepspeed import is_deepspeed_zero3_enabled
+from ...integrations.fsdp import is_fsdp_managed_module
+from ...modeling_layers import GradientCheckpointingLayer
+from ...modeling_outputs import BaseModelOutput, Wav2Vec2BaseModelOutput
+from ...modeling_utils import PreTrainedModel
+from ...utils import ModelOutput, auto_docstring, logging
+from ..wav2vec2.modeling_wav2vec2 import (
+    Wav2Vec2Adapter,
+    Wav2Vec2AdapterLayer,
+    Wav2Vec2FeatureEncoder,
+    Wav2Vec2FeatureProjection,
+    Wav2Vec2FeedForward,
+    Wav2Vec2ForAudioFrameClassification,
+    Wav2Vec2ForCTC,
+    Wav2Vec2ForPreTraining,
+    Wav2Vec2ForSequenceClassification,
+    Wav2Vec2ForXVector,
+    Wav2Vec2GumbelVectorQuantizer,
+    Wav2Vec2Model,
+    Wav2Vec2PositionalConvEmbedding,
+)
+from .configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig
+
+
+logger = logging.get_logger(__name__)
+
+_HIDDEN_STATES_START_POSITION = 2
+
+
+@dataclass
+@auto_docstring(
+    custom_intro="""
+    Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions.
+    """
+)
+class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput):
+    r"""
+    loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
+        Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
+        paper](https://huggingface.co/papers/2006.11477).
+    projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
+        projected quantized states.
+    projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
+        Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
+        target vectors for contrastive loss.
+    codevector_perplexity (`torch.FloatTensor` of shape `(1,)`):
+        The perplexity of the codevector distribution, used to measure the diversity of the codebook.
+    contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
+        The contrastive loss (L_m) as stated in the [official paper](https://huggingface.co/papers/2006.11477).
+    diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
+        The diversity loss (L_d) as stated in the [official paper](https://huggingface.co/papers/2006.11477).
+    """
+
+    loss: Optional[torch.FloatTensor] = None
+    projected_states: Optional[torch.FloatTensor] = None
+    projected_quantized_states: Optional[torch.FloatTensor] = None
+    codevector_perplexity: Optional[torch.FloatTensor] = None
+    hidden_states: Optional[tuple[torch.FloatTensor]] = None
+    attentions: Optional[tuple[torch.FloatTensor]] = None
+    contrastive_loss: Optional[torch.FloatTensor] = None
+    diversity_loss: Optional[torch.FloatTensor] = None
+
+
+class Wav2Vec2ConformerPositionalConvEmbedding(Wav2Vec2PositionalConvEmbedding):
+    pass
+
+
+class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module):
+    """Rotary positional embedding
+    Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://huggingface.co/papers/2104.09864
+    """
+
+    def __init__(self, config):
+        super().__init__()
+        dim = config.hidden_size // config.num_attention_heads
+        base = config.rotary_embedding_base
+
+        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.int64).float() / dim))
+        self.register_buffer("inv_freq", inv_freq)
+        self.cached_sequence_length = None
+        self.cached_rotary_positional_embedding = None
+
+    def forward(self, hidden_states):
+        sequence_length = hidden_states.shape[1]
+
+        if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
+            return self.cached_rotary_positional_embedding
+
+        self.cached_sequence_length = sequence_length
+        # Embeddings are computed in the dtype of the inv_freq constant
+        time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
+        freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
+        embeddings = torch.cat((freqs, freqs), dim=-1)
+
+        cos_embeddings = embeddings.cos()[:, None, None, :]
+        sin_embeddings = embeddings.sin()[:, None, None, :]
+        # Computed embeddings are cast to the dtype of the hidden state inputs
+        self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings]).type_as(hidden_states)
+        return self.cached_rotary_positional_embedding
+
+
+class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
+    """Relative positional encoding module."""
+
+    def __init__(self, config):
+        super().__init__()
+        self.max_len = config.max_source_positions
+        self.d_model = config.hidden_size
+        self.pe = None
+        self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
+
+    def extend_pe(self, x):
+        # Reset the positional encodings
+        if self.pe is not None:
+            # self.pe contains both positive and negative parts
+            # the length of self.pe is 2 * input_len - 1
+            if self.pe.size(1) >= x.size(1) * 2 - 1:
+                if self.pe.dtype != x.dtype or self.pe.device != x.device:
+                    self.pe = self.pe.to(dtype=x.dtype, device=x.device)
+                return
+        # Suppose `i` is the position of query vector and `j` is the
+        # position of key vector. We use positive relative positions when keys
+        # are to the left (i>j) and negative relative positions otherwise (i<j).
+        pe_positive = torch.zeros(x.size(1), self.d_model)
+        pe_negative = torch.zeros(x.size(1), self.d_model)
+        position = torch.arange(0, x.size(1), dtype=torch.int64).float().unsqueeze(1)
+        div_term = torch.exp(
+            torch.arange(0, self.d_model, 2, dtype=torch.int64).float() * -(math.log(10000.0) / self.d_model)
+        )
+        pe_positive[:, 0::2] = torch.sin(position * div_term)
+        pe_positive[:, 1::2] = torch.cos(position * div_term)
+        pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
+        pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
+
+        # Reverse the order of positive indices and concat both positive and
+        # negative indices. This is used to support the shifting trick
+        # as in https://huggingface.co/papers/1901.02860
+        pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
+        pe_negative = pe_negative[1:].unsqueeze(0)
+        pe = torch.cat([pe_positive, pe_negative], dim=1)
+        self.pe = pe.to(device=x.device, dtype=x.dtype)
+
+    def forward(self, hidden_states: torch.Tensor):
+        self.extend_pe(hidden_states)
+        start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
+        end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
+        relative_position_embeddings = self.pe[:, start_idx:end_idx]
+
+        return relative_position_embeddings
+
+
+class Wav2Vec2ConformerFeatureEncoder(Wav2Vec2FeatureEncoder):
+    pass
+
+
+class Wav2Vec2ConformerFeatureProjection(Wav2Vec2FeatureProjection):
+    pass
+
+
+class Wav2Vec2ConformerFeedForward(Wav2Vec2FeedForward):
+    pass
+
+
+class Wav2Vec2ConformerConvolutionModule(nn.Module):
+    """Convolution block used in the conformer block"""
+
+    def __init__(self, config):
+        super().__init__()
+        if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
+            raise ValueError("`config.conv_depthwise_kernel_size` should be a odd number for 'SAME' padding")
+        self.layer_norm = nn.LayerNorm(config.hidden_size)
+        self.pointwise_conv1 = nn.Conv1d(
+            config.hidden_size,
+            2 * config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.glu = nn.GLU(dim=1)
+        self.depthwise_conv = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            config.conv_depthwise_kernel_size,
+            stride=1,
+            padding=(config.conv_depthwise_kernel_size - 1) // 2,
+            groups=config.hidden_size,
+            bias=False,
+        )
+        self.batch_norm = nn.BatchNorm1d(config.hidden_size)
+        self.activation = ACT2FN[config.hidden_act]
+        self.pointwise_conv2 = nn.Conv1d(
+            config.hidden_size,
+            config.hidden_size,
+            kernel_size=1,
+            stride=1,
+            padding=0,
+            bias=False,
+        )
+        self.dropout = nn.Dropout(config.conformer_conv_dropout)
+
+    def forward(self, hidden_states):
+        hidden_states = self.layer_norm(hidden_states)
+        # exchange the temporal dimension and the feature dimension
+        hidden_states = hidden_states.transpose(1, 2)
+
+        # GLU mechanism
+        # => (batch, 2*channel, dim)
+        hidden_states = self.pointwise_conv1(hidden_states)
+        # => (batch, channel, dim)
+        hidden_states = self.glu(hidden_states)
+
+        # 1D Depthwise Conv
+        hidden_states = self.depthwise_conv(hidden_states)
+        hidden_states = self.batch_norm(hidden_states)
+        hidden_states = self.activation(hidden_states)
+
+        hidden_states = self.pointwise_conv2(hidden_states)
+        hidden_states = self.dropout(hidden_states)
+        hidden_states = hidden_states.transpose(1, 2)
+        return hidden_states
+
+
+class Wav2Vec2ConformerSelfAttention(nn.Module):
+    """Construct an Wav2Vec2ConformerSelfAttention object.
+    Can be enhanced with rotary or relative position embeddings.
+    """
+
+    def __init__(self, config):
+        super().__init__()
+
+        self.head_size = config.hidden_size // config.num_attention_heads
+        self.num_heads = config.num_attention_heads
+        self.position_embeddings_type = config.position_embeddings_type
+
+        self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
+        self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
+
+        self.dropout = nn.Dropout(p=config.attention_dropout)
+
+        if self.position_embeddings_type == "relative":
+            # linear transformation for positional encoding
+            self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
+            # these two learnable bias are used in matrix c and matrix d
+            # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+            self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+            self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        # self-attention mechanism
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+
+        # make sure query/key states can be != value states
+        query_key_states = hidden_states
+        value_states = hidden_states
+
+        if self.position_embeddings_type == "rotary":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
+                )
+            query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
+
+        # project query_key_states and value_states
+        query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
+        value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
+
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        key = key.transpose(1, 2)
+        value = value.transpose(1, 2)
+
+        if self.position_embeddings_type == "relative":
+            if relative_position_embeddings is None:
+                raise ValueError(
+                    "`relative_position_embeddings` has to be defined when `self.position_embeddings_type =="
+                    " 'relative'"
+                )
+            # apply relative_position_embeddings to qk scores
+            # as proposed in Transformer_XL: https://huggingface.co/papers/1901.02860
+            scores = self._apply_relative_embeddings(
+                query=query, key=key, relative_position_embeddings=relative_position_embeddings
+            )
+        else:
+            scores = torch.matmul(query, key.transpose(-2, -1)) / math.sqrt(self.head_size)
+
+        # apply attention_mask if necessary
+        if attention_mask is not None:
+            scores = scores + attention_mask
+
+        # => (batch, head, time1, time2)
+        probs = torch.softmax(scores, dim=-1)
+        probs = self.dropout(probs)
+
+        # => (batch, head, time1, d_k)
+        hidden_states = torch.matmul(probs, value)
+
+        # => (batch, time1, hidden_size)
+        hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
+        hidden_states = self.linear_out(hidden_states)
+
+        return hidden_states, probs
+
+    def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
+        batch_size, sequence_length, hidden_size = hidden_states.size()
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
+
+        cos = relative_position_embeddings[0, :sequence_length, ...]
+        sin = relative_position_embeddings[1, :sequence_length, ...]
+
+        # rotate hidden_states with rotary embeddings
+        hidden_states = hidden_states.transpose(0, 1)
+        rotated_states_begin = hidden_states[..., : self.head_size // 2]
+        rotated_states_end = hidden_states[..., self.head_size // 2 :]
+        rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
+        hidden_states = (hidden_states * cos) + (rotated_states * sin)
+        hidden_states = hidden_states.transpose(0, 1)
+
+        hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
+
+        return hidden_states
+
+    def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
+        # 1. project positional embeddings
+        # => (batch, head, 2*time1-1, d_k)
+        proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.view(
+            relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
+        )
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
+        proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
+
+        # 2. Add bias to query
+        # => (batch, head, time1, d_k)
+        query = query.transpose(1, 2)
+        q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
+        q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
+
+        # 3. attention score: first compute matrix a and matrix c
+        # as described in https://huggingface.co/papers/1901.02860 Section 3.3
+        # => (batch, head, time1, time2)
+        scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
+
+        # 4. then compute matrix b and matrix d
+        # => (batch, head, time1, 2*time1-1)
+        scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
+
+        # 5. shift matrix b and matrix d
+        zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
+        scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
+        scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
+        scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
+        scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
+        scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
+
+        # 6. sum matrices
+        # => (batch, head, time1, time2)
+        scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
+
+        return scores
+
+
+class Wav2Vec2ConformerEncoderLayer(GradientCheckpointingLayer):
+    """Conformer block based on https://huggingface.co/papers/2005.08100."""
+
+    def __init__(self, config):
+        super().__init__()
+        embed_dim = config.hidden_size
+        dropout = config.attention_dropout
+
+        # Feed-forward 1
+        self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn1 = Wav2Vec2ConformerFeedForward(config)
+
+        # Self-Attention
+        self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
+        self.self_attn_dropout = nn.Dropout(dropout)
+        self.self_attn = Wav2Vec2ConformerSelfAttention(config)
+
+        # Conformer Convolution
+        self.conv_module = Wav2Vec2ConformerConvolutionModule(config)
+
+        # Feed-forward 2
+        self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
+        self.ffn2 = Wav2Vec2ConformerFeedForward(config)
+        self.final_layer_norm = nn.LayerNorm(embed_dim)
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask: Optional[torch.Tensor] = None,
+        relative_position_embeddings: Optional[torch.Tensor] = None,
+        output_attentions: bool = False,
+    ):
+        # 1. Feed-Forward 1 layer
+        residual = hidden_states
+        hidden_states = self.ffn1_layer_norm(hidden_states)
+        hidden_states = self.ffn1(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        residual = hidden_states
+
+        # 2. Self-Attention layer
+        hidden_states = self.self_attn_layer_norm(hidden_states)
+        hidden_states, attn_weigts = self.self_attn(
+            hidden_states=hidden_states,
+            attention_mask=attention_mask,
+            relative_position_embeddings=relative_position_embeddings,
+            output_attentions=output_attentions,
+        )
+        hidden_states = self.self_attn_dropout(hidden_states)
+        hidden_states = hidden_states + residual
+
+        # 3. Convolutional Layer
+        residual = hidden_states
+        hidden_states = self.conv_module(hidden_states)
+        hidden_states = residual + hidden_states
+
+        # 4. Feed-Forward 2 Layer
+        residual = hidden_states
+        hidden_states = self.ffn2_layer_norm(hidden_states)
+        hidden_states = self.ffn2(hidden_states)
+        hidden_states = hidden_states * 0.5 + residual
+        hidden_states = self.final_layer_norm(hidden_states)
+
+        return hidden_states, attn_weigts
+
+
+class Wav2Vec2ConformerEncoder(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+
+        if config.position_embeddings_type == "relative":
+            self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config)
+        elif config.position_embeddings_type == "rotary":
+            self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config)
+        else:
+            self.embed_positions = None
+
+        self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config)
+        self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
+        self.dropout = nn.Dropout(config.hidden_dropout)
+        self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)])
+        self.gradient_checkpointing = False
+
+    def forward(
+        self,
+        hidden_states,
+        attention_mask=None,
+        output_attentions=False,
+        output_hidden_states=False,
+        return_dict=True,
+    ):
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attentions = () if output_attentions else None
+
+        if attention_mask is not None:
+            # make sure padded tokens output 0
+            expand_attention_mask = attention_mask.unsqueeze(-1).repeat(1, 1, hidden_states.shape[2])
+            hidden_states[~expand_attention_mask] = 0.0
+
+            # extend attention_mask
+            attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
+            attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
+            attention_mask = attention_mask.expand(
+                attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
+            )
+
+        hidden_states = self.dropout(hidden_states)
+
+        if self.embed_positions is not None:
+            relative_position_embeddings = self.embed_positions(hidden_states)
+        else:
+            relative_position_embeddings = None
+
+        synced_gpus = is_deepspeed_zero3_enabled() or is_fsdp_managed_module(self)
+
+        for i, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            # add LayerDrop (see https://huggingface.co/papers/1909.11556 for description)
+            dropout_probability = torch.rand([])
+
+            skip_the_layer = self.training and dropout_probability < self.config.layerdrop
+            if not skip_the_layer or synced_gpus:
+                # under fsdp or deepspeed zero3 all gpus must run in sync
+                layer_outputs = layer(
+                    hidden_states,
+                    attention_mask=attention_mask,
+                    relative_position_embeddings=relative_position_embeddings,
+                    output_attentions=output_attentions,
+                )
+                hidden_states = layer_outputs[0]
+
+            if skip_the_layer:
+                layer_outputs = (None, None)
+
+            if output_attentions:
+                all_self_attentions = all_self_attentions + (layer_outputs[1],)
+
+        hidden_states = self.layer_norm(hidden_states)
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        if not return_dict:
+            return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
+        return BaseModelOutput(
+            last_hidden_state=hidden_states,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attentions,
+        )
+
+
+class Wav2Vec2ConformerGumbelVectorQuantizer(Wav2Vec2GumbelVectorQuantizer):
+    pass
+
+
+class Wav2Vec2ConformerAdapter(Wav2Vec2Adapter):
+    pass
+
+
+class Wav2Vec2ConformerAdapterLayer(Wav2Vec2AdapterLayer):
+    pass
+
+
+@auto_docstring
+class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):
+    config: Wav2Vec2ConformerConfig
+    base_model_prefix = "wav2vec2_conformer"
+    main_input_name = "input_values"
+    supports_gradient_checkpointing = True
+
+    def _init_weights(self, module):
+        """Initialize the weights"""
+        # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init.
+        if isinstance(module, Wav2Vec2ConformerForPreTraining):
+            module.project_hid.reset_parameters()
+            module.project_q.reset_parameters()
+            module.project_hid._is_hf_initialized = True
+            module.project_q._is_hf_initialized = True
+        # gumbel softmax requires special init
+        elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer):
+            module.weight_proj.weight.data.normal_(mean=0.0, std=1)
+            module.weight_proj.bias.data.zero_()
+            nn.init.uniform_(module.codevectors)
+        elif isinstance(module, Wav2Vec2ConformerSelfAttention):
+            if hasattr(module, "pos_bias_u"):
+                nn.init.xavier_uniform_(module.pos_bias_u)
+            if hasattr(module, "pos_bias_v"):
+                nn.init.xavier_uniform_(module.pos_bias_v)
+        elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding):
+            nn.init.normal_(
+                module.conv.weight,
+                mean=0,
+                std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
+            )
+            nn.init.constant_(module.conv.bias, 0)
+        elif isinstance(module, Wav2Vec2ConformerFeatureProjection):
+            k = math.sqrt(1 / module.projection.in_features)
+            nn.init.uniform_(module.projection.weight, a=-k, b=k)
+            nn.init.uniform_(module.projection.bias, a=-k, b=k)
+        elif isinstance(module, nn.Linear):
+            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
+
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
+            module.bias.data.zero_()
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, nn.Conv1d):
+            nn.init.kaiming_normal_(module.weight)
+
+            if module.bias is not None:
+                k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
+                nn.init.uniform_(module.bias, a=-k, b=k)
+
+    def _get_feat_extract_output_lengths(
+        self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
+    ):
+        """
+        Computes the output length of the convolutional layers
+        """
+
+        add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
+
+        def _conv_out_length(input_length, kernel_size, stride):
+            # 1D convolutional layer output length formula taken
+            # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
+            return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
+
+        for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
+            input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
+
+        if add_adapter:
+            for _ in range(self.config.num_adapter_layers):
+                input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
+
+        return input_lengths
+
+    def _get_feature_vector_attention_mask(
+        self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
+    ):
+        # Effectively attention_mask.sum(-1), but not inplace to be able to run
+        # on inference mode.
+        non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
+
+        output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
+        output_lengths = output_lengths.to(torch.long)
+
+        batch_size = attention_mask.shape[0]
+
+        attention_mask = torch.zeros(
+            (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
+        )
+        # these two operations makes sure that all values before the output lengths idxs are attended to
+        attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
+        attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
+        return attention_mask
+
+
+WAV2VEC2_CONFORMER_START_DOCSTRING = None  # will be automatically redefined
+
+
+Wav2Vec2ConformerBaseModelOutput = Wav2Vec2BaseModelOutput
+
+
+class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel, Wav2Vec2Model):
+    def __init__(self, config: Wav2Vec2ConformerConfig):
+        Wav2Vec2ConformerPreTrainedModel.__init__(self, config)
+        self.config = config
+        self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config)
+        self.feature_projection = Wav2Vec2ConformerFeatureProjection(config)
+
+        # model only needs masking vector if mask prob is > 0.0
+        if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
+            self.masked_spec_embed = nn.Parameter(torch.Tensor(config.hidden_size).uniform_())
+
+        self.encoder = Wav2Vec2ConformerEncoder(config)
+
+        self.adapter = Wav2Vec2ConformerAdapter(config) if config.add_adapter else None
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+
+class Wav2Vec2ConformerForPreTraining(Wav2Vec2ForPreTraining):
+    def __init__(self, config: Wav2Vec2ConformerConfig):
+        super().__init__(config)
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+
+class Wav2Vec2ConformerForCTC(Wav2Vec2ForCTC):
+    def __init__(self, config, target_lang: Optional[str] = None):
+        r"""
+        target_lang (`str`, *optional*):
+            Language id of adapter weights. Adapter weights are stored in the format adapter.<lang>.safetensors or
+            adapter.<lang>.bin. Only relevant when using an instance of [`UniSpeechSatForCTC`] with adapters. Uses 'eng' by
+            default.
+        """
+        super().__init__(config)
+
+    def tie_weights(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+    def freeze_base_model(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+
+class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ForSequenceClassification):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+
+class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ForAudioFrameClassification):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+
+class Wav2Vec2ConformerForXVector(Wav2Vec2ForXVector):
+    def __init__(self, config):
+        super().__init__(config)
+
+    def freeze_feature_extractor(self):
+        raise AttributeError("Not needed for Wav2Vec2Conformer")
+
+
+__all__ = [
+    "Wav2Vec2ConformerForAudioFrameClassification",
+    "Wav2Vec2ConformerForCTC",
+    "Wav2Vec2ConformerForPreTraining",
+    "Wav2Vec2ConformerForSequenceClassification",
+    "Wav2Vec2ConformerForXVector",
+    "Wav2Vec2ConformerModel",
+    "Wav2Vec2ConformerPreTrainedModel",
+]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..95fd10dab7092fb8571a332b1ecf2e9a334d8fc4
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/__init__.py
@@ -0,0 +1,26 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .processing_wav2vec2_with_lm import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
new file mode 100644
index 0000000000000000000000000000000000000000..beb22ca86749f01cde572cad7fe0b19e01f1e415
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wav2vec2_with_lm/processing_wav2vec2_with_lm.py
@@ -0,0 +1,659 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""
+Speech processor class for Wav2Vec2
+"""
+
+import os
+import warnings
+from collections.abc import Iterable
+from contextlib import contextmanager, nullcontext
+from dataclasses import dataclass
+from multiprocessing import Pool, get_context, get_start_method
+from typing import TYPE_CHECKING, Optional, Union
+
+import numpy as np
+
+from ...processing_utils import ProcessorMixin
+from ...utils import ModelOutput, logging, requires_backends
+
+
+logger = logging.get_logger(__name__)
+
+
+if TYPE_CHECKING:
+    from pyctcdecode import BeamSearchDecoderCTC
+
+    from ...feature_extraction_utils import FeatureExtractionMixin
+    from ...tokenization_utils import PreTrainedTokenizerBase
+
+
+ListOfDict = list[dict[str, Union[int, str]]]
+
+
+@dataclass
+class Wav2Vec2DecoderWithLMOutput(ModelOutput):
+    """
+    Output type of [`Wav2Vec2DecoderWithLM`], with transcription.
+
+    Args:
+        text (list of `str` or `str`):
+            Decoded logits in text from. Usually the speech transcription.
+        logit_score (list of `float` or `float`):
+            Total logit score of the beams associated with produced text.
+        lm_score (list of `float`):
+            Fused lm_score of the beams associated with produced text.
+        word_offsets (list of `list[dict[str, Union[int, str]]]` or `list[dict[str, Union[int, str]]]`):
+            Offsets of the decoded words. In combination with sampling rate and model downsampling rate word offsets
+            can be used to compute time stamps for each word.
+    """
+
+    text: Union[list[list[str]], list[str], str]
+    logit_score: Union[list[list[float]], list[float], float] = None
+    lm_score: Union[list[list[float]], list[float], float] = None
+    word_offsets: Union[list[list[ListOfDict]], list[ListOfDict], ListOfDict] = None
+
+
+class Wav2Vec2ProcessorWithLM(ProcessorMixin):
+    r"""
+    Constructs a Wav2Vec2 processor which wraps a Wav2Vec2 feature extractor, a Wav2Vec2 CTC tokenizer and a decoder
+    with language model support into a single processor for language model boosted speech recognition decoding.
+
+    Args:
+        feature_extractor ([`Wav2Vec2FeatureExtractor`] or [`SeamlessM4TFeatureExtractor`]):
+            An instance of [`Wav2Vec2FeatureExtractor`] or [`SeamlessM4TFeatureExtractor`]. The feature extractor is a required input.
+        tokenizer ([`Wav2Vec2CTCTokenizer`]):
+            An instance of [`Wav2Vec2CTCTokenizer`]. The tokenizer is a required input.
+        decoder (`pyctcdecode.BeamSearchDecoderCTC`):
+            An instance of [`pyctcdecode.BeamSearchDecoderCTC`]. The decoder is a required input.
+    """
+
+    feature_extractor_class = "AutoFeatureExtractor"
+    tokenizer_class = "Wav2Vec2CTCTokenizer"
+
+    def __init__(
+        self,
+        feature_extractor: "FeatureExtractionMixin",
+        tokenizer: "PreTrainedTokenizerBase",
+        decoder: "BeamSearchDecoderCTC",
+    ):
+        from pyctcdecode import BeamSearchDecoderCTC
+
+        super().__init__(feature_extractor, tokenizer)
+        if not isinstance(decoder, BeamSearchDecoderCTC):
+            raise TypeError(f"`decoder` has to be of type {BeamSearchDecoderCTC.__class__}, but is {type(decoder)}")
+
+        if feature_extractor.__class__.__name__ not in ["Wav2Vec2FeatureExtractor", "SeamlessM4TFeatureExtractor"]:
+            raise ValueError(
+                f"`feature_extractor` has to be of type `Wav2Vec2FeatureExtractor` or `SeamlessM4TFeatureExtractor`, but is {type(feature_extractor)}"
+            )
+
+        # make sure that decoder's alphabet and tokenizer's vocab match in content
+        missing_decoder_tokens = self.get_missing_alphabet_tokens(decoder, tokenizer)
+        if len(missing_decoder_tokens) > 0:
+            raise ValueError(
+                f"The tokens {missing_decoder_tokens} are defined in the tokenizer's "
+                "vocabulary, but not in the decoder's alphabet. "
+                f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
+            )
+
+        self.decoder = decoder
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+    def save_pretrained(self, save_directory):
+        super().save_pretrained(save_directory)
+        self.decoder.save_to_dir(save_directory)
+
+    @classmethod
+    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
+        r"""
+        Instantiate a [`Wav2Vec2ProcessorWithLM`] from a pretrained Wav2Vec2 processor.
+
+        <Tip>
+
+        This class method is simply calling the feature extractor's
+        [`~feature_extraction_utils.FeatureExtractionMixin.from_pretrained`], Wav2Vec2CTCTokenizer's
+        [`~tokenization_utils_base.PreTrainedTokenizerBase.from_pretrained`], and
+        [`pyctcdecode.BeamSearchDecoderCTC.load_from_hf_hub`].
+
+        Please refer to the docstrings of the methods above for more information.
+
+        </Tip>
+
+        Args:
+            pretrained_model_name_or_path (`str` or `os.PathLike`):
+                This can be either:
+
+                - a string, the *model id* of a pretrained feature_extractor hosted inside a model repo on
+                  huggingface.co.
+                - a path to a *directory* containing a feature extractor file saved using the
+                  [`~SequenceFeatureExtractor.save_pretrained`] method, e.g., `./my_model_directory/`.
+                - a path or url to a saved feature extractor JSON *file*, e.g.,
+                  `./my_model_directory/preprocessor_config.json`.
+            **kwargs
+                Additional keyword arguments passed along to both [`SequenceFeatureExtractor`] and
+                [`PreTrainedTokenizer`]
+        """
+        requires_backends(cls, "pyctcdecode")
+        from pyctcdecode import BeamSearchDecoderCTC
+
+        feature_extractor, tokenizer = super()._get_arguments_from_pretrained(pretrained_model_name_or_path, **kwargs)
+
+        if os.path.isdir(pretrained_model_name_or_path) or os.path.isfile(pretrained_model_name_or_path):
+            unigram_encoding = kwargs.get("unigram_encoding", "utf-8")
+            decoder = BeamSearchDecoderCTC.load_from_dir(pretrained_model_name_or_path, unigram_encoding)
+        else:
+            # BeamSearchDecoderCTC has no auto class
+            kwargs.pop("_from_auto", None)
+            # snapshot_download has no `trust_remote_code` flag
+            kwargs.pop("trust_remote_code", None)
+
+            # make sure that only relevant filenames are downloaded
+            language_model_filenames = os.path.join(BeamSearchDecoderCTC._LANGUAGE_MODEL_SERIALIZED_DIRECTORY, "*")
+            alphabet_filename = BeamSearchDecoderCTC._ALPHABET_SERIALIZED_FILENAME
+            allow_patterns = [language_model_filenames, alphabet_filename]
+
+            decoder = BeamSearchDecoderCTC.load_from_hf_hub(
+                pretrained_model_name_or_path, allow_patterns=allow_patterns, **kwargs
+            )
+
+        # set language model attributes
+        for attribute in ["alpha", "beta", "unk_score_offset", "score_boundary"]:
+            value = kwargs.pop(attribute, None)
+
+            if value is not None:
+                cls._set_language_model_attribute(decoder, attribute, value)
+
+        # make sure that decoder's alphabet and tokenizer's vocab match in content
+        missing_decoder_tokens = cls.get_missing_alphabet_tokens(decoder, tokenizer)
+        if len(missing_decoder_tokens) > 0:
+            raise ValueError(
+                f"The tokens {missing_decoder_tokens} are defined in the tokenizer's "
+                "vocabulary, but not in the decoder's alphabet. "
+                f"Make sure to include {missing_decoder_tokens} in the decoder's alphabet."
+            )
+
+        return cls(feature_extractor=feature_extractor, tokenizer=tokenizer, decoder=decoder)
+
+    @staticmethod
+    def _set_language_model_attribute(decoder: "BeamSearchDecoderCTC", attribute: str, value: float):
+        setattr(decoder.model_container[decoder._model_key], attribute, value)
+
+    @property
+    def language_model(self):
+        return self.decoder.model_container[self.decoder._model_key]
+
+    @staticmethod
+    def get_missing_alphabet_tokens(decoder, tokenizer):
+        from pyctcdecode.alphabet import BLANK_TOKEN_PTN, UNK_TOKEN, UNK_TOKEN_PTN
+
+        # we need to make sure that all of the tokenizer's except the special tokens
+        # are present in the decoder's alphabet. Retrieve missing alphabet token
+        # from decoder
+        tokenizer_vocab_list = list(tokenizer.get_vocab().keys())
+
+        # replace special tokens
+        for i, token in enumerate(tokenizer_vocab_list):
+            if BLANK_TOKEN_PTN.match(token):
+                tokenizer_vocab_list[i] = ""
+            if token == tokenizer.word_delimiter_token:
+                tokenizer_vocab_list[i] = " "
+            if UNK_TOKEN_PTN.match(token):
+                tokenizer_vocab_list[i] = UNK_TOKEN
+
+        # are any of the extra tokens no special tokenizer tokens?
+        missing_tokens = set(tokenizer_vocab_list) - set(decoder._alphabet.labels)
+
+        return missing_tokens
+
+    def __call__(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to the feature extractor's
+        [`~FeatureExtractionMixin.__call__`] and returns its output. If used in the context
+        [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to
+        Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.__call__`]. Please refer to the docstring of the above two
+        methods for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor(*args, **kwargs)
+
+        if "raw_speech" in kwargs:
+            warnings.warn("Using `raw_speech` as a keyword argument is deprecated. Use `audio` instead.")
+            audio = kwargs.pop("raw_speech")
+        else:
+            audio = kwargs.pop("audio", None)
+        sampling_rate = kwargs.pop("sampling_rate", None)
+        text = kwargs.pop("text", None)
+        if len(args) > 0:
+            audio = args[0]
+            args = args[1:]
+
+        if audio is None and text is None:
+            raise ValueError("You need to specify either an `audio` or `text` input to process.")
+
+        if audio is not None:
+            inputs = self.feature_extractor(audio, *args, sampling_rate=sampling_rate, **kwargs)
+        if text is not None:
+            encodings = self.tokenizer(text, **kwargs)
+
+        if text is None:
+            return inputs
+        elif audio is None:
+            return encodings
+        else:
+            inputs["labels"] = encodings["input_ids"]
+            return inputs
+
+    def pad(self, *args, **kwargs):
+        """
+        When used in normal mode, this method forwards all its arguments to the feature extractor's
+        [`~FeatureExtractionMixin.pad`] and returns its output. If used in the context
+        [`~Wav2Vec2ProcessorWithLM.as_target_processor`] this method forwards all its arguments to
+        Wav2Vec2CTCTokenizer's [`~Wav2Vec2CTCTokenizer.pad`]. Please refer to the docstring of the above two methods
+        for more information.
+        """
+        # For backward compatibility
+        if self._in_target_context_manager:
+            return self.current_processor.pad(*args, **kwargs)
+
+        input_features = kwargs.pop("input_features", None)
+        labels = kwargs.pop("labels", None)
+        if len(args) > 0:
+            input_features = args[0]
+            args = args[1:]
+
+        if input_features is not None:
+            input_features = self.feature_extractor.pad(input_features, *args, **kwargs)
+        if labels is not None:
+            labels = self.tokenizer.pad(labels, **kwargs)
+
+        if labels is None:
+            return input_features
+        elif input_features is None:
+            return labels
+        else:
+            input_features["labels"] = labels["input_ids"]
+            return input_features
+
+    def batch_decode(
+        self,
+        logits: np.ndarray,
+        pool: Optional[Pool] = None,
+        num_processes: Optional[int] = None,
+        beam_width: Optional[int] = None,
+        beam_prune_logp: Optional[float] = None,
+        token_min_logp: Optional[float] = None,
+        hotwords: Optional[Iterable[str]] = None,
+        hotword_weight: Optional[float] = None,
+        alpha: Optional[float] = None,
+        beta: Optional[float] = None,
+        unk_score_offset: Optional[float] = None,
+        lm_score_boundary: Optional[bool] = None,
+        output_word_offsets: bool = False,
+        n_best: int = 1,
+    ):
+        """
+        Batch decode output logits to audio transcription with language model support.
+
+        <Tip>
+
+        This function makes use of Python's multiprocessing. Currently, multiprocessing is available only on Unix
+        systems (see this [issue](https://github.com/kensho-technologies/pyctcdecode/issues/65)).
+
+        If you are decoding multiple batches, consider creating a `Pool` and passing it to `batch_decode`. Otherwise,
+        `batch_decode` will be very slow since it will create a fresh `Pool` for each call. See usage example below.
+
+        </Tip>
+
+        Args:
+            logits (`np.ndarray`):
+                The logits output vector of the model representing the log probabilities for each token.
+            pool (`multiprocessing.Pool`, *optional*):
+                An optional user-managed pool. If not set, one will be automatically created and closed. The pool
+                should be instantiated *after* `Wav2Vec2ProcessorWithLM`. Otherwise, the LM won't be available to the
+                pool's sub-processes.
+
+                <Tip>
+
+                Currently, only pools created with a 'fork' context can be used. If a 'spawn' pool is passed, it will
+                be ignored and sequential decoding will be used instead.
+
+                </Tip>
+
+            num_processes (`int`, *optional*):
+                If `pool` is not set, number of processes on which the function should be parallelized over. Defaults
+                to the number of available CPUs.
+            beam_width (`int`, *optional*):
+                Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH.
+            beam_prune_logp (`int`, *optional*):
+                Beams that are much worse than best beam will be pruned Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP.
+            token_min_logp (`int`, *optional*):
+                Tokens below this logp are skipped unless they are argmax of frame Defaults to pyctcdecode's
+                DEFAULT_MIN_TOKEN_LOGP.
+            hotwords (`list[str]`, *optional*):
+                List of words with extra importance, can be OOV for LM
+            hotword_weight (`int`, *optional*):
+                Weight factor for hotword importance Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT.
+            alpha (`float`, *optional*):
+                Weight for language model during shallow fusion
+            beta (`float`, *optional*):
+                Weight for length score adjustment of during scoring
+            unk_score_offset (`float`, *optional*):
+                Amount of log score offset for unknown tokens
+            lm_score_boundary (`bool`, *optional*):
+                Whether to have kenlm respect boundaries when scoring
+            output_word_offsets (`bool`, *optional*, defaults to `False`):
+                Whether or not to output word offsets. Word offsets can be used in combination with the sampling rate
+                and model downsampling rate to compute the time-stamps of transcribed words.
+            n_best (`int`, *optional*, defaults to `1`):
+                Number of best hypotheses to return. If `n_best` is greater than 1, the returned `text` will be a list
+                of lists of strings, `logit_score` will be a list of lists of floats, and `lm_score` will be a list of
+                lists of floats, where the length of the outer list will correspond to the batch size and the length of
+                the inner list will correspond to the number of returned hypotheses . The value should be >= 1.
+
+                <Tip>
+
+                Please take a look at the Example of [`~Wav2Vec2ProcessorWithLM.decode`] to better understand how to
+                make use of `output_word_offsets`. [`~Wav2Vec2ProcessorWithLM.batch_decode`] works the same way with
+                batched output.
+
+                </Tip>
+
+        Returns:
+            [`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`].
+
+        Example:
+            See [Decoding multiple audios](#decoding-multiple-audios).
+        """
+
+        from pyctcdecode.constants import (
+            DEFAULT_BEAM_WIDTH,
+            DEFAULT_HOTWORD_WEIGHT,
+            DEFAULT_MIN_TOKEN_LOGP,
+            DEFAULT_PRUNE_LOGP,
+        )
+
+        # set defaults
+        beam_width = beam_width if beam_width is not None else DEFAULT_BEAM_WIDTH
+        beam_prune_logp = beam_prune_logp if beam_prune_logp is not None else DEFAULT_PRUNE_LOGP
+        token_min_logp = token_min_logp if token_min_logp is not None else DEFAULT_MIN_TOKEN_LOGP
+        hotword_weight = hotword_weight if hotword_weight is not None else DEFAULT_HOTWORD_WEIGHT
+
+        # reset params at every forward call. It's just a `set` method in pyctcdecode
+        self.decoder.reset_params(
+            alpha=alpha, beta=beta, unk_score_offset=unk_score_offset, lm_score_boundary=lm_score_boundary
+        )
+
+        # create multiprocessing pool and list numpy arrays
+        # filter out logits padding
+        logits_list = [array[(array != -100.0).all(axis=-1)] for array in logits]
+
+        # create a pool if necessary while also using it as a context manager to close itself
+        if pool is None:
+            # fork is safe to use only on Unix, see "Contexts and start methods" section on
+            # multiprocessing's docs (https://docs.python.org/3/library/multiprocessing.html#contexts-and-start-methods)
+            default_context = get_start_method()
+
+            if default_context == "fork":
+                cm = pool = get_context().Pool(num_processes)
+            else:
+                logger.warning(
+                    "Parallel batch decoding is not currently supported in this platform. "
+                    "Falling back to sequential decoding."
+                )
+                cm = nullcontext()
+        else:
+            # pool is managed by the user, so we don't need to close it
+            cm = nullcontext()
+
+            if num_processes is not None:
+                logger.warning(
+                    "Parameter `num_process` was passed, but it will be ignored since `pool` was also specified."
+                )
+
+        # pyctcdecode
+        with cm:
+            decoded_beams = self.decoder.decode_beams_batch(
+                pool=pool,
+                logits_list=logits_list,
+                beam_width=beam_width,
+                beam_prune_logp=beam_prune_logp,
+                token_min_logp=token_min_logp,
+                hotwords=hotwords,
+                hotword_weight=hotword_weight,
+            )
+
+        # extract text and scores
+        batch_texts, logit_scores, lm_scores, word_offsets = [], [], [], []
+
+        for d in decoded_beams:
+            batch_texts.append([beam[0] for beam in d])
+            logit_scores.append([beam[-2] for beam in d])
+            lm_scores.append([beam[-1] for beam in d])
+
+            # word_offsets.append([{"word": t[0], "start_offset": t[1][0], "end_offset": t[1][1]} for t in d[0][1]])
+
+            word_offsets.append(
+                [
+                    [
+                        {"word": word, "start_offset": start_offset, "end_offset": end_offset}
+                        for word, (start_offset, end_offset) in beam[1]
+                    ]
+                    for beam in d
+                ]
+            )
+
+        word_offsets = word_offsets if output_word_offsets else None
+
+        if n_best == 1:
+            return Wav2Vec2DecoderWithLMOutput(
+                text=[hyps[0] for hyps in batch_texts],
+                logit_score=[hyps[0] for hyps in logit_scores],
+                lm_score=[hyps[0] for hyps in lm_scores],
+                word_offsets=[hyps[0] for hyps in word_offsets] if word_offsets is not None else None,
+            )
+        else:
+            return Wav2Vec2DecoderWithLMOutput(
+                text=[hyps[:n_best] for hyps in batch_texts],
+                logit_score=[hyps[:n_best] for hyps in logit_scores],
+                lm_score=[hyps[:n_best] for hyps in lm_scores],
+                word_offsets=[hyps[:n_best] for hyps in word_offsets] if word_offsets is not None else None,
+            )
+
+    def decode(
+        self,
+        logits: np.ndarray,
+        beam_width: Optional[int] = None,
+        beam_prune_logp: Optional[float] = None,
+        token_min_logp: Optional[float] = None,
+        hotwords: Optional[Iterable[str]] = None,
+        hotword_weight: Optional[float] = None,
+        alpha: Optional[float] = None,
+        beta: Optional[float] = None,
+        unk_score_offset: Optional[float] = None,
+        lm_score_boundary: Optional[bool] = None,
+        output_word_offsets: bool = False,
+        n_best: int = 1,
+    ):
+        """
+        Decode output logits to audio transcription with language model support.
+
+        Args:
+            logits (`np.ndarray`):
+                The logits output vector of the model representing the log probabilities for each token.
+            beam_width (`int`, *optional*):
+                Maximum number of beams at each step in decoding. Defaults to pyctcdecode's DEFAULT_BEAM_WIDTH.
+            beam_prune_logp (`int`, *optional*):
+                A threshold to prune beams with log-probs less than best_beam_logp + beam_prune_logp. The value should
+                be <= 0. Defaults to pyctcdecode's DEFAULT_PRUNE_LOGP.
+            token_min_logp (`int`, *optional*):
+                Tokens with log-probs below token_min_logp are skipped unless they are have the maximum log-prob for an
+                utterance. Defaults to pyctcdecode's DEFAULT_MIN_TOKEN_LOGP.
+            hotwords (`list[str]`, *optional*):
+                List of words with extra importance which can be missing from the LM's vocabulary, e.g. ["huggingface"]
+            hotword_weight (`int`, *optional*):
+                Weight multiplier that boosts hotword scores. Defaults to pyctcdecode's DEFAULT_HOTWORD_WEIGHT.
+            alpha (`float`, *optional*):
+                Weight for language model during shallow fusion
+            beta (`float`, *optional*):
+                Weight for length score adjustment of during scoring
+            unk_score_offset (`float`, *optional*):
+                Amount of log score offset for unknown tokens
+            lm_score_boundary (`bool`, *optional*):
+                Whether to have kenlm respect boundaries when scoring
+            output_word_offsets (`bool`, *optional*, defaults to `False`):
+                Whether or not to output word offsets. Word offsets can be used in combination with the sampling rate
+                and model downsampling rate to compute the time-stamps of transcribed words.
+            n_best (`int`, *optional*, defaults to `1`):
+                Number of best hypotheses to return. If `n_best` is greater than 1, the returned `text` will be a list
+                of strings, `logit_score` will be a list of floats, and `lm_score` will be a list of floats, where the
+                length of these lists will correspond to the number of returned hypotheses. The value should be >= 1.
+
+                <Tip>
+
+                Please take a look at the example below to better understand how to make use of `output_word_offsets`.
+
+                </Tip>
+
+        Returns:
+            [`~models.wav2vec2.Wav2Vec2DecoderWithLMOutput`].
+
+        Example:
+
+        ```python
+        >>> # Let's see how to retrieve time steps for a model
+        >>> from transformers import AutoTokenizer, AutoProcessor, AutoModelForCTC
+        >>> from datasets import load_dataset
+        >>> import datasets
+        >>> import torch
+
+        >>> # import model, feature extractor, tokenizer
+        >>> model = AutoModelForCTC.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
+        >>> processor = AutoProcessor.from_pretrained("patrickvonplaten/wav2vec2-base-100h-with-lm")
+
+        >>> # load first sample of English common_voice
+        >>> dataset = load_dataset("mozilla-foundation/common_voice_11_0", "en", split="train", streaming=True)
+        >>> dataset = dataset.cast_column("audio", datasets.Audio(sampling_rate=16_000))
+        >>> dataset_iter = iter(dataset)
+        >>> sample = next(dataset_iter)
+
+        >>> # forward sample through model to get greedily predicted transcription ids
+        >>> input_values = processor(sample["audio"]["array"], return_tensors="pt").input_values
+        >>> with torch.no_grad():
+        ...     logits = model(input_values).logits[0].cpu().numpy()
+
+        >>> # retrieve word stamps (analogous commands for `output_char_offsets`)
+        >>> outputs = processor.decode(logits, output_word_offsets=True)
+        >>> # compute `time_offset` in seconds as product of downsampling ratio and sampling_rate
+        >>> time_offset = model.config.inputs_to_logits_ratio / processor.feature_extractor.sampling_rate
+
+        >>> word_offsets = [
+        ...     {
+        ...         "word": d["word"],
+        ...         "start_time": round(d["start_offset"] * time_offset, 2),
+        ...         "end_time": round(d["end_offset"] * time_offset, 2),
+        ...     }
+        ...     for d in outputs.word_offsets
+        ... ]
+        >>> # compare word offsets with audio `en_train_0/common_voice_en_19121553.mp3` online on the dataset viewer:
+        >>> # https://huggingface.co/datasets/mozilla-foundation/common_voice_11_0/viewer/en
+        >>> word_offsets[:4]
+        [{'word': 'THE', 'start_time': 0.68, 'end_time': 0.78}, {'word': 'TRACK', 'start_time': 0.88, 'end_time': 1.1}, {'word': 'APPEARS', 'start_time': 1.18, 'end_time': 1.66}, {'word': 'ON', 'start_time': 1.86, 'end_time': 1.92}]
+        ```"""
+
+        from pyctcdecode.constants import (
+            DEFAULT_BEAM_WIDTH,
+            DEFAULT_HOTWORD_WEIGHT,
+            DEFAULT_MIN_TOKEN_LOGP,
+            DEFAULT_PRUNE_LOGP,
+        )
+
+        # set defaults
+        beam_width = beam_width if beam_width is not None else DEFAULT_BEAM_WIDTH
+        beam_prune_logp = beam_prune_logp if beam_prune_logp is not None else DEFAULT_PRUNE_LOGP
+        token_min_logp = token_min_logp if token_min_logp is not None else DEFAULT_MIN_TOKEN_LOGP
+        hotword_weight = hotword_weight if hotword_weight is not None else DEFAULT_HOTWORD_WEIGHT
+
+        # reset params at every forward call. It's just a `set` method in pyctcdecode
+        self.decoder.reset_params(
+            alpha=alpha, beta=beta, unk_score_offset=unk_score_offset, lm_score_boundary=lm_score_boundary
+        )
+
+        # pyctcdecode
+        decoded_beams = self.decoder.decode_beams(
+            logits,
+            beam_width=beam_width,
+            beam_prune_logp=beam_prune_logp,
+            token_min_logp=token_min_logp,
+            hotwords=hotwords,
+            hotword_weight=hotword_weight,
+        )
+
+        word_offsets = None
+        if output_word_offsets:
+            word_offsets = [
+                [
+                    {"word": word, "start_offset": start_offset, "end_offset": end_offset}
+                    for word, (start_offset, end_offset) in beam[2]
+                ]
+                for beam in decoded_beams
+            ]
+        logit_scores = [beam[-2] for beam in decoded_beams]
+
+        lm_scores = [beam[-1] for beam in decoded_beams]
+
+        hypotheses = [beam[0] for beam in decoded_beams]
+
+        if n_best > len(decoded_beams):
+            logger.info(
+                "N-best size is larger than the number of generated hypotheses, all hypotheses will be returned."
+            )
+
+        if n_best == 1:
+            return Wav2Vec2DecoderWithLMOutput(
+                text=hypotheses[0],
+                logit_score=logit_scores[0],
+                lm_score=lm_scores[0],
+                word_offsets=word_offsets[0] if word_offsets is not None else None,
+            )
+        else:
+            return Wav2Vec2DecoderWithLMOutput(
+                text=hypotheses[:n_best],
+                logit_score=logit_scores[:n_best],
+                lm_score=lm_scores[:n_best],
+                word_offsets=word_offsets[:n_best] if word_offsets is not None else None,
+            )
+
+    @contextmanager
+    def as_target_processor(self):
+        """
+        Temporarily sets the processor for processing the target. Useful for encoding the labels when fine-tuning
+        Wav2Vec2.
+        """
+        warnings.warn(
+            "`as_target_processor` is deprecated and will be removed in v5 of Transformers. You can process your "
+            "labels by using the argument `text` of the regular `__call__` method (either in the same call as "
+            "your audio inputs, or in a separate call."
+        )
+        self._in_target_context_manager = True
+        self.current_processor = self.tokenizer
+        yield
+        self.current_processor = self.feature_extractor
+        self._in_target_context_manager = False
+
+
+__all__ = ["Wav2Vec2ProcessorWithLM"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..d230bf3f0924b436e1b9cd9ea509a989cf5bdb8b
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_wavlm import *
+    from .modeling_wavlm import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dd241e4524f2e28ef39d5d2bc7c940b10bf879ba
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/configuration_wavlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/configuration_wavlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4141f877c6571c7b0263064c97e9dc31ce1072bb
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/configuration_wavlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modeling_wavlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modeling_wavlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..f02b9b878618494636cd0bd68e1b855af9ba2d9b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modeling_wavlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modular_wavlm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modular_wavlm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4e062c1b82560126b7b05fe4830757cf1bc700e0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/__pycache__/modular_wavlm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/configuration_wavlm.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/configuration_wavlm.py
new file mode 100644
index 0000000000000000000000000000000000000000..a137792e2a8da48684738ea396e6704dc849613a
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/wavlm/configuration_wavlm.py
@@ -0,0 +1,337 @@
+# coding=utf-8
+# Copyright 2021 The Fairseq Authors, Microsoft Research, and The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""WavLM model configuration"""
+
+import functools
+import operator
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class WavLMConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`WavLMModel`]. It is used to instantiate an WavLM
+    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
+    defaults will yield a similar configuration to that of the WavLM
+    [microsoft/wavlm-base](https://huggingface.co/microsoft/wavlm-base) architecture.
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32):
+            Vocabulary size of the WavLM model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`WavLMModel`]. Vocabulary size of the model. Defines the different tokens
+            that can be represented by the *inputs_ids* passed to the forward method of [`WavLMModel`].
+        hidden_size (`int`, *optional*, defaults to 768):
+            Dimensionality of the encoder layers and the pooler layer.
+        num_hidden_layers (`int`, *optional*, defaults to 12):
+            Number of hidden layers in the Transformer encoder.
+        num_attention_heads (`int`, *optional*, defaults to 12):
+            Number of attention heads for each attention layer in the Transformer encoder.
+        intermediate_size (`int`, *optional*, defaults to 3072):
+            Dimensionality of the "intermediate" (i.e., feed-forward) layer in the Transformer encoder.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
+            `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        hidden_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
+        activation_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for activations inside the fully connected layer.
+        attention_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout ratio for the attention probabilities.
+        final_dropout (`float`, *optional*, defaults to 0.1):
+            The dropout probability for the final projection layer of [`WavLMForCTC`].
+        layerdrop (`float`, *optional*, defaults to 0.1):
+            The LayerDrop probability. See the [LayerDrop paper](see https://huggingface.co/papers/1909.11556) for more
+            details.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        layer_norm_eps (`float`, *optional*, defaults to 1e-12):
+            The epsilon used by the layer normalization layers.
+        feat_extract_norm (`str`, *optional*, defaults to `"group"`):
+            The norm to be applied to 1D convolutional layers in feature encoder. One of `"group"` for group
+            normalization of only the first 1D convolutional layer or `"layer"` for layer normalization of all 1D
+            convolutional layers.
+        feat_proj_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout probability for output of the feature encoder.
+        feat_extract_activation (`str, `optional`, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the 1D convolutional layers of the feature
+            extractor. If string, `"gelu"`, `"relu"`, `"selu"` and `"gelu_new"` are supported.
+        conv_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 512, 512, 512)`):
+            A tuple of integers defining the number of input and output channels of each 1D convolutional layer in the
+            feature encoder. The length of *conv_dim* defines the number of 1D convolutional layers.
+        conv_stride (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 2, 2, 2, 2, 2, 2)`):
+            A tuple of integers defining the stride of each 1D convolutional layer in the feature encoder. The length
+            of *conv_stride* defines the number of convolutional layers and has to match the length of *conv_dim*.
+        conv_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(10, 3, 3, 3, 3, 3, 3)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the feature encoder. The
+            length of *conv_kernel* defines the number of convolutional layers and has to match the length of
+            *conv_dim*.
+        conv_bias (`bool`, *optional*, defaults to `False`):
+            Whether the 1D convolutional layers have a bias.
+        num_conv_pos_embeddings (`int`, *optional*, defaults to 128):
+            Number of convolutional positional embeddings. Defines the kernel size of 1D convolutional positional
+            embeddings layer.
+        num_conv_pos_embedding_groups (`int`, *optional*, defaults to 16):
+            Number of groups of 1D convolutional positional embeddings layer.
+        do_stable_layer_norm (`bool`, *optional*, defaults to `False`):
+            Whether to apply *stable* layer norm architecture of the Transformer encoder. `do_stable_layer_norm is
+            True` corresponds to applying layer norm before the attention layer, whereas `do_stable_layer_norm is
+            False` corresponds to applying layer norm after the attention layer.
+        apply_spec_augment (`bool`, *optional*, defaults to `True`):
+            Whether to apply *SpecAugment* data augmentation to the outputs of the feature encoder. For reference see
+            [SpecAugment: A Simple Data Augmentation Method for Automatic Speech
+            Recognition](https://huggingface.co/papers/1904.08779).
+        mask_time_prob (`float`, *optional*, defaults to 0.05):
+            Probability of each feature vector along the time axis to be chosen as the start of the vector span to be
+            masked. Approximately `mask_time_prob * sequence_length // mask_time_length` feature vectors will be masked
+            along the time axis. This is only relevant if `apply_spec_augment is True`.
+        mask_time_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the time axis.
+        mask_time_min_masks (`int`, *optional*, defaults to 2),:
+            The minimum number of masks of length `mask_feature_length` generated along the time axis, each time step,
+            irrespectively of `mask_feature_prob`. Only relevant if ''mask_time_prob*len(time_axis)/mask_time_length <
+            mask_time_min_masks''
+        mask_feature_prob (`float`, *optional*, defaults to 0.0):
+            Probability of each feature vector along the feature axis to be chosen as the start of the vector span to
+            be masked. Approximately `mask_time_prob * hidden_size // mask_time_length` feature vectors will be masked
+            along the time axis. This is only relevant if `apply_spec_augment is True`.
+        mask_feature_length (`int`, *optional*, defaults to 10):
+            Length of vector span along the feature axis.
+        num_codevectors_per_group (`int`, *optional*, defaults to 320):
+            Number of entries in each quantization codebook (group).
+        num_codevector_groups (`int`, *optional*, defaults to 2):
+            Number of codevector groups for product codevector quantization.
+        contrastive_logits_temperature (`float`, *optional*, defaults to 0.1):
+            The temperature *kappa* in the contrastive loss.
+        num_negatives (`int`, *optional*, defaults to 100):
+            Number of negative samples for the contrastive loss.
+        codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the quantized feature vectors.
+        proj_codevector_dim (`int`, *optional*, defaults to 256):
+            Dimensionality of the final projection of both the quantized and the transformer features.
+        diversity_loss_weight (`int`, *optional*, defaults to 0.1):
+            The weight of the codebook diversity loss component.
+        ctc_loss_reduction (`str`, *optional*, defaults to `"mean"`):
+            Specifies the reduction to apply to the output of `torch.nn.CTCLoss`. Only relevant when training an
+            instance of [`WavLMForCTC`].
+        ctc_zero_infinity (`bool`, *optional*, defaults to `False`):
+            Whether to zero infinite losses and the associated gradients of `torch.nn.CTCLoss`. Infinite losses mainly
+            occur when the inputs are too short to be aligned to the targets. Only relevant when training an instance
+            of [`WavLMForCTC`].
+        use_weighted_layer_sum (`bool`, *optional*, defaults to `False`):
+            Whether to use a weighted average of layer outputs with learned weights. Only relevant when using an
+            instance of [`WavLMForSequenceClassification`].
+        classifier_proj_size (`int`, *optional*, defaults to 256):
+            Dimensionality of the projection before token mean-pooling for classification.
+        tdnn_dim (`tuple[int]` or `list[int]`, *optional*, defaults to `(512, 512, 512, 512, 1500)`):
+            A tuple of integers defining the number of output channels of each 1D convolutional layer in the *TDNN*
+            module of the *XVector* model. The length of *tdnn_dim* defines the number of *TDNN* layers.
+        tdnn_kernel (`tuple[int]` or `list[int]`, *optional*, defaults to `(5, 3, 3, 1, 1)`):
+            A tuple of integers defining the kernel size of each 1D convolutional layer in the *TDNN* module of the
+            *XVector* model. The length of *tdnn_kernel* has to match the length of *tdnn_dim*.
+        tdnn_dilation (`tuple[int]` or `list[int]`, *optional*, defaults to `(1, 2, 3, 1, 1)`):
+            A tuple of integers defining the dilation factor of each 1D convolutional layer in *TDNN* module of the
+            *XVector* model. The length of *tdnn_dilation* has to match the length of *tdnn_dim*.
+        xvector_output_dim (`int`, *optional*, defaults to 512):
+            Dimensionality of the *XVector* embedding vectors.
+        add_adapter (`bool`, *optional*, defaults to `False`):
+            Whether a convolutional network should be stacked on top of the Wav2Vec2 Encoder. Can be very useful for
+            warm-starting Wav2Vec2 for SpeechEncoderDecoder models.
+        adapter_kernel_size (`int`, *optional*, defaults to 3):
+            Kernel size of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        adapter_stride (`int`, *optional*, defaults to 2):
+            Stride of the convolutional layers in the adapter network. Only relevant if `add_adapter is True`.
+        num_adapter_layers (`int`, *optional*, defaults to 3):
+            Number of convolutional layers that should be used in the adapter network. Only relevant if `add_adapter is
+            True`.
+        output_hidden_size (`int`, *optional*):
+            Dimensionality of the encoder output layer. If not defined, this defaults to *hidden-size*. Only relevant
+            if `add_adapter is True`.
+
+    Example:
+
+    ```python
+
+    ```
+
+    Example:
+
+    ```python
+    >>> from transformers import WavLMConfig, WavLMModel
+
+    >>> # Initializing a WavLM facebook/wavlm-base-960h style configuration
+    >>> configuration = WavLMConfig()
+
+    >>> # Initializing a model (with random weights) from the facebook/wavlm-base-960h style configuration
+    >>> model = WavLMModel(configuration)
+
+    >>> # Accessing the model configuration
+    >>> configuration = model.config
+    ```"""
+
+    model_type = "wavlm"
+
+    def __init__(
+        self,
+        vocab_size=32,
+        hidden_size=768,
+        num_hidden_layers=12,
+        num_attention_heads=12,
+        intermediate_size=3072,
+        hidden_act="gelu",
+        hidden_dropout=0.1,
+        activation_dropout=0.1,
+        attention_dropout=0.1,
+        feat_proj_dropout=0.0,
+        final_dropout=0.1,
+        layerdrop=0.1,
+        initializer_range=0.02,
+        layer_norm_eps=1e-5,
+        feat_extract_norm="group",
+        feat_extract_activation="gelu",
+        conv_dim=(512, 512, 512, 512, 512, 512, 512),
+        conv_stride=(5, 2, 2, 2, 2, 2, 2),
+        conv_kernel=(10, 3, 3, 3, 3, 2, 2),
+        conv_bias=False,
+        num_conv_pos_embeddings=128,
+        num_conv_pos_embedding_groups=16,
+        num_buckets=320,
+        max_bucket_distance=800,
+        do_stable_layer_norm=False,
+        apply_spec_augment=True,
+        mask_time_prob=0.05,
+        mask_time_length=10,
+        mask_time_min_masks=2,
+        mask_feature_prob=0.0,
+        mask_feature_length=10,
+        num_codevectors_per_group=320,
+        num_codevector_groups=2,
+        contrastive_logits_temperature=0.1,
+        num_negatives=100,
+        codevector_dim=256,
+        proj_codevector_dim=256,
+        diversity_loss_weight=0.1,
+        ctc_loss_reduction="mean",
+        ctc_zero_infinity=False,
+        use_weighted_layer_sum=False,
+        classifier_proj_size=256,
+        tdnn_dim=(512, 512, 512, 512, 1500),
+        tdnn_kernel=(5, 3, 3, 1, 1),
+        tdnn_dilation=(1, 2, 3, 1, 1),
+        xvector_output_dim=512,
+        num_ctc_classes=80,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        add_adapter=False,
+        adapter_kernel_size=3,
+        adapter_stride=2,
+        num_adapter_layers=3,
+        output_hidden_size=None,
+        **kwargs,
+    ):
+        super().__init__(**kwargs, pad_token_id=pad_token_id, bos_token_id=bos_token_id, eos_token_id=eos_token_id)
+        self.hidden_size = hidden_size
+        self.feat_extract_norm = feat_extract_norm
+        self.feat_extract_activation = feat_extract_activation
+        self.conv_dim = list(conv_dim)
+        self.conv_stride = list(conv_stride)
+        self.conv_kernel = list(conv_kernel)
+        self.conv_bias = conv_bias
+        self.num_buckets = num_buckets
+        self.max_bucket_distance = max_bucket_distance
+        self.num_conv_pos_embeddings = num_conv_pos_embeddings
+        self.num_conv_pos_embedding_groups = num_conv_pos_embedding_groups
+        self.num_feat_extract_layers = len(self.conv_dim)
+        self.num_hidden_layers = num_hidden_layers
+        self.intermediate_size = intermediate_size
+        self.hidden_act = hidden_act
+        self.num_attention_heads = num_attention_heads
+        self.hidden_dropout = hidden_dropout
+        self.attention_dropout = attention_dropout
+        self.activation_dropout = activation_dropout
+        self.feat_proj_dropout = feat_proj_dropout
+        self.final_dropout = final_dropout
+        self.layerdrop = layerdrop
+        self.layer_norm_eps = layer_norm_eps
+        self.initializer_range = initializer_range
+        self.num_ctc_classes = num_ctc_classes
+        self.vocab_size = vocab_size
+        self.do_stable_layer_norm = do_stable_layer_norm
+        self.use_weighted_layer_sum = use_weighted_layer_sum
+        self.classifier_proj_size = classifier_proj_size
+
+        if (
+            (len(self.conv_stride) != self.num_feat_extract_layers)
+            or (len(self.conv_kernel) != self.num_feat_extract_layers)
+            or (len(self.conv_dim) != self.num_feat_extract_layers)
+        ):
+            raise ValueError(
+                "Configuration for convolutional layers is incorrect. It is required that `len(config.conv_dim)` =="
+                " `len(config.conv_stride)` == `len(config.conv_kernel)`, but is `len(config.conv_dim) ="
+                f" {len(self.conv_dim)}`, `len(config.conv_stride) = {len(self.conv_stride)}`,"
+                f" `len(config.conv_kernel) = {len(self.conv_kernel)}`."
+            )
+
+        # fine-tuning config parameters for SpecAugment: https://huggingface.co/papers/1904.08779
+        self.apply_spec_augment = apply_spec_augment
+        self.mask_time_prob = mask_time_prob
+        self.mask_time_length = mask_time_length
+        self.mask_time_min_masks = mask_time_min_masks
+        self.mask_feature_prob = mask_feature_prob
+        self.mask_feature_length = mask_feature_length
+
+        # parameters for pretraining with codevector quantized representations
+        self.num_codevectors_per_group = num_codevectors_per_group
+        self.num_codevector_groups = num_codevector_groups
+        self.contrastive_logits_temperature = contrastive_logits_temperature
+        self.num_negatives = num_negatives
+        self.codevector_dim = codevector_dim
+        self.proj_codevector_dim = proj_codevector_dim
+        self.diversity_loss_weight = diversity_loss_weight
+
+        # ctc loss
+        self.ctc_loss_reduction = ctc_loss_reduction
+        self.ctc_zero_infinity = ctc_zero_infinity
+
+        # adapter
+        self.add_adapter = add_adapter
+        self.adapter_kernel_size = adapter_kernel_size
+        self.adapter_stride = adapter_stride
+        self.num_adapter_layers = num_adapter_layers
+        self.output_hidden_size = output_hidden_size or hidden_size
+
+        # SequenceClassification-specific parameter. Feel free to ignore for other classes.
+        self.classifier_proj_size = classifier_proj_size
+
+        # XVector-specific parameters. Feel free to ignore for other classes.
+        self.tdnn_dim = list(tdnn_dim)
+        self.tdnn_kernel = list(tdnn_kernel)
+        self.tdnn_dilation = list(tdnn_dilation)
+        self.xvector_output_dim = xvector_output_dim
+
+    @property
+    def inputs_to_logits_ratio(self):
+        return functools.reduce(operator.mul, self.conv_stride, 1)
+
+
+__all__ = ["WavLMConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/english_normalizer.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/english_normalizer.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..b71c5c63e23670419c1b0c97f8bb0a9213d84eab
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/english_normalizer.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/feature_extraction_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/feature_extraction_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8fc628177ebbd5b34d85e1c85ec0824a69f30d77
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/feature_extraction_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/generation_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/generation_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3fc4757392daf84222bbc29dbc6c9da0e0249a87
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/generation_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..2c1c3fa29edc0a982f638b9c9459df11cac82fef
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/modeling_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..79493a1fced057403a6d50f08cf614f2df4228b2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/whisper/__pycache__/tokenization_whisper.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a1c3a3aa68efdaa624dde59fb3fae6b3d479bcc0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/configuration_x_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/configuration_x_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..3a782794fc81127a57dae5cabf60ad0c55abcc61
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/configuration_x_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/modeling_x_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/modeling_x_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8946c31fdb0e6b7d8189bd139a5496d373daaa5e
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/modeling_x_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/processing_x_clip.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/processing_x_clip.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..dffe7a4b70dcba7e15e5782e74990de68936a36c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/x_clip/__pycache__/processing_x_clip.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d668f03813127910b42fb3bf2f8e8c7375381f0d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5793f62222eca9fb99ed98650e17ee4946a1c196
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/configuration_xglm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..9ca7203d5f09064fc7e02c27fe4e22ba26f557cc
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_flax_xglm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..23a184e84fdd9f7cda75026b5ce4d0d2a174c2b2
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_tf_xglm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..4bfdb86334885bd1d78908cbeead4e870b3f76a7
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/modeling_xglm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..939bf84ffce488ad72f1117ba107c76d097c9b3d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..a47535ffbe8dff81b391c0319b164ff40ea1a142
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xglm/__pycache__/tokenization_xglm_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..e5359628fbcc9e60703dbc9a97528b51a60ab07c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/configuration_xlnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/configuration_xlnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..7ccf9e62084a629193b011319490905ab5e41496
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/configuration_xlnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/modeling_tf_xlnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/modeling_tf_xlnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..1127b7b3e54f9b92e0860b288ae7e1791302078d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/modeling_tf_xlnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..89d8cc26022cb976b50fc9f51bce43b01c4a0b2c
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet_fast.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet_fast.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..ad2d74a14f252ff4469b9eae812462e642cdf0bf
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlnet/__pycache__/tokenization_xlnet_fast.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..5c26dd9a5e2106e07559a5a19c3e46fdd42c20c0
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..d2481db319e01b8e658d75a4a00f737183535875
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/configuration_xlstm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..8db6ee7fed0a48fb0f12c2577d4d44b1366c177b
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/xlstm/__pycache__/modeling_xlstm.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/__init__.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..48a233755de26862db8489d083ffc7f2809e821d
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/__init__.py
@@ -0,0 +1,27 @@
+# Copyright 2024 The HuggingFace Team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+from typing import TYPE_CHECKING
+
+from ...utils import _LazyModule
+from ...utils.import_utils import define_import_structure
+
+
+if TYPE_CHECKING:
+    from .configuration_zamba import *
+    from .modeling_zamba import *
+else:
+    import sys
+
+    _file = globals()["__file__"]
+    sys.modules[__name__] = _LazyModule(__name__, _file, define_import_structure(_file), module_spec=__spec__)
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/configuration_zamba.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/configuration_zamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..3de9a19d1bd73e52b5f9819bbe67cc816bc10503
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/configuration_zamba.py
@@ -0,0 +1,227 @@
+# coding=utf-8
+# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Zamba model configuration"""
+
+import math
+
+from ...configuration_utils import PretrainedConfig
+from ...utils import logging
+
+
+logger = logging.get_logger(__name__)
+
+
+class ZambaConfig(PretrainedConfig):
+    r"""
+    This is the configuration class to store the configuration of a [`ZambaModel`]. It is used to instantiate a
+    Zamba model according to the specified arguments, defining the model architecture. Instantiating a configuration
+    with the defaults will yield a similar configuration to that of the Zamba-v0.1 model.
+
+    [Zyphra/Zamba-7B-v1](https://huggingface.co/Zyphra/Zamba-7B-v1)
+
+    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
+    documentation from [`PretrainedConfig`] for more information.
+
+
+    Args:
+        vocab_size (`int`, *optional*, defaults to 32000):
+            Vocabulary size of the Zamba model. Defines the number of different tokens that can be represented by the
+            `inputs_ids` passed when calling [`ZambaModel`]
+        tie_word_embeddings (`bool`, *optional*, defaults to `True`):
+            Whether the model's input and output word embeddings should be tied. Note that this is only relevant if the
+            model has a output word embedding layer.
+        hidden_size (`int`, *optional*, defaults to 3712):
+            Dimension of the hidden representations.
+        attention_hidden_size (`int`, *optional*):
+            Dimension of the hidden representations of the inputs to the Attention layer.
+        intermediate_size (`int`, *optional*, defaults to 14848):
+            Dimension of the MLP representations.
+        num_hidden_layers (`int`, *optional*, defaults to 76):
+            Number of hidden layers in the model.
+        num_attention_heads (`int`, *optional*, defaults to 16):
+            Number of attention heads for each attention layer in the Transformer decoder.
+        attention_head_dim (`int`, *optional*):
+            Dimension of the attention head in the Transformer decoder.
+        num_key_value_heads (`int`, *optional*, defaults to 16):
+            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
+            `num_key_value_heads=None`, the model will use Multi Head Attention (MHA), if
+            `num_key_value_heads=1 the model will use Multi Query Attention (MQA) otherwise GQA is used. When
+            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
+            by meanpooling all the original heads within that group. For more details, check out [this
+            paper](https://huggingface.co/papers/2305.13245).
+        n_mamba_heads (`int`, *optional*, defaults to 2):
+            Number of mamba heads for each mamba layer.
+        hidden_act (`str` or `function`, *optional*, defaults to `"gelu"`):
+            The non-linear activation function (function or string) in the decoder.
+        hidden_mamba_act (`str` or `function`, *optional*, defaults to `"silu"`):
+            The non-linear activation function (function or string) in the mamba layer.
+        initializer_range (`float`, *optional*, defaults to 0.02):
+            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
+        rms_norm_eps (`float`, *optional*, defaults to 1e-05):
+            The epsilon used by the rms normalization layers.
+        use_cache (`bool`, *optional*, defaults to `True`):
+            Whether or not the model should return the last key/values attentions (not used by all models). Only
+            relevant if `config.is_decoder=True`.
+        num_logits_to_keep (`int` or `None`, *optional*, defaults to 1):
+            Number of prompt logits to calculate during generation. If `None`, all logits will be calculated. If an
+            integer value, only last `num_logits_to_keep` logits will be calculated. Default is 1 because only the
+            logits of the last prompt token are needed for generation. For long sequences, the logits for the entire
+            sequence may use a lot of memory so, setting `num_logits_to_keep=1` will reduce memory footprint
+            significantly.
+        pad_token_id (`int`, *optional*, defaults to 0):
+            The id of the padding token.
+        bos_token_id (`int`, *optional*, defaults to 1):
+            The id of the "beginning-of-sequence" token.
+        eos_token_id (`int`, *optional*, defaults to 2):
+            The id of the "end-of-sequence" token.
+        max_position_embeddings (`int`, *optional*, defaults to 4096):
+            This value doesn't have any real effect. The maximum sequence length that this model is intended to be
+            used with. It can be used with longer sequences, but performance may degrade.
+        attention_dropout (`float`, *optional*, defaults to 0.0):
+            The dropout ratio for the attention probabilities.
+        attn_layer_period (`int`, *optional*, defaults to 6):
+            Once in this many layers, we will have a shared attention layer
+        attn_layer_offset (`int`, *optional*, defaults to 4):
+            Offset of the shared attention layer
+        use_mamba_kernels (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use the fast mamba kernels. These are available only if `mamba-ssm` and
+            `causal-conv1d` are installed, and the mamba modules are running on a CUDA device. Raises ValueError if
+            `True` and kernels are not available
+        mamba_d_state (`int`, *optional*, defaults to 16):
+            The dimension the mamba state space latents
+        mamba_d_conv (`int`, *optional*, defaults to 4):
+            The size of the mamba convolution kernel
+        mamba_expand (`int`, *optional*, defaults to 2):
+            Expanding factor (relative to hidden_size) used to determine the mamba intermediate size
+        mamba_dt_rank (`Union[int,str]`, *optional*, defaults to `"auto"`):
+            Rank of the mamba discretization projection matrix. `"auto"` means that it will default to `math.ceil(self.hidden_size / 16)`
+        time_step_min (`float`, *optional*, defaults to 0.001):
+            Minimum `time_step` used to bound `dt_proj_bias`.
+        time_step_max (`float`, *optional*, defaults to 0.1):
+            Maximum `time_step` used to bound `dt_proj_bias`.
+        time_step_floor (`float`, *optional*, defaults to 0.0001):
+            Minimum clamping value of the `dt_proj.bias` layer initialization.
+        mamba_conv_bias (`bool`, *optional*, defaults to `True`):
+            Flag indicating whether or not to use bias in the convolution layer of the mamba mixer block.
+        mamba_proj_bias (`bool`, *optional*, defaults to `False`):
+            Flag indicating whether or not to use bias in the input and output projections (["in_proj", "out_proj"]) of the mamba mixer block
+
+    """
+
+    model_type = "zamba"
+    keys_to_ignore_at_inference = ["past_key_values"]
+
+    def __init__(
+        self,
+        vocab_size=32000,
+        tie_word_embeddings=True,
+        hidden_size=3712,
+        attention_hidden_size=None,
+        intermediate_size=14848,
+        num_hidden_layers=76,
+        num_attention_heads=16,
+        attention_head_dim=None,
+        num_key_value_heads=16,
+        n_mamba_heads=2,
+        hidden_act="gelu",
+        hidden_mamba_act="silu",
+        initializer_range=0.02,
+        rms_norm_eps=1e-5,
+        use_cache=True,
+        num_logits_to_keep=1,
+        pad_token_id=0,
+        bos_token_id=1,
+        eos_token_id=2,
+        max_position_embeddings=4096,
+        attention_dropout=0.0,
+        attn_layer_period=6,
+        attn_layer_offset=4,
+        use_mamba_kernels=True,
+        mamba_d_state=16,
+        mamba_d_conv=4,
+        mamba_expand=2,
+        mamba_dt_rank="auto",
+        time_step_min=0.001,
+        time_step_max=0.1,
+        time_step_floor=1e-4,
+        mamba_conv_bias=True,
+        mamba_proj_bias=False,
+        **kwargs,
+    ):
+        self.vocab_size = vocab_size
+        self.tie_word_embeddings = tie_word_embeddings
+        self.hidden_size = hidden_size
+        if attention_hidden_size is None:
+            self.attention_hidden_size = 2 * hidden_size
+        else:
+            self.attention_hidden_size = attention_hidden_size
+        self.intermediate_size = intermediate_size
+        self.num_hidden_layers = num_hidden_layers
+        self.num_attention_heads = num_attention_heads
+        if attention_head_dim is None:
+            self.attention_head_dim = 2 * self.hidden_size // self.num_attention_heads
+        else:
+            self.attention_head_dim = attention_head_dim
+        self.max_position_embeddings = max_position_embeddings
+        self.attention_dropout = attention_dropout
+
+        self.num_key_value_heads = num_key_value_heads
+        self.n_mamba_heads = n_mamba_heads
+        self.hidden_act = hidden_act
+        self.hidden_mamba_act = hidden_mamba_act
+        self.initializer_range = initializer_range
+        self.rms_norm_eps = rms_norm_eps
+
+        self.use_cache = use_cache
+        self.num_logits_to_keep = num_logits_to_keep
+
+        self.attn_layer_period = attn_layer_period
+        self.attn_layer_offset = attn_layer_offset
+
+        self.use_mamba_kernels = use_mamba_kernels
+        self.mamba_d_state = mamba_d_state
+        self.mamba_d_conv = mamba_d_conv
+        self.mamba_expand = mamba_expand
+        self.mamba_dt_rank = math.ceil(self.hidden_size / 16) if mamba_dt_rank == "auto" else mamba_dt_rank
+        self.time_step_min = time_step_min
+        self.time_step_max = time_step_max
+        self.time_step_floor = time_step_floor
+        self.mamba_conv_bias = mamba_conv_bias
+        self.mamba_proj_bias = mamba_proj_bias
+
+        self.layers_block_type = self._layers_block_type(num_hidden_layers, attn_layer_period, attn_layer_offset)
+
+        assert (self.mamba_expand * self.hidden_size) % self.n_mamba_heads == 0, (
+            "`intermediate_size` should be divisible by `n_mamba_heads`."
+        )
+
+        super().__init__(
+            pad_token_id=pad_token_id,
+            bos_token_id=bos_token_id,
+            eos_token_id=eos_token_id,
+            tie_word_embeddings=tie_word_embeddings,
+            **kwargs,
+        )
+
+    def _layers_block_type(self, num_hidden_layers, attn_layer_period, attn_layer_offset):
+        layers = [
+            "mamba",
+            "mamba",
+            "hybrid",
+        ] + ["hybrid" if i % attn_layer_period == attn_layer_offset else "mamba" for i in range(num_hidden_layers - 3)]
+        return layers
+
+
+__all__ = ["ZambaConfig"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.py b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.py
new file mode 100644
index 0000000000000000000000000000000000000000..dc95e1e550faa51a0c9a973dfc660d405f85fd28
--- /dev/null
+++ b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba/modeling_zamba.py
@@ -0,0 +1,1316 @@
+# coding=utf-8
+# Copyright 2024 Zyphra Technologies and the HuggingFace Inc. team. All rights reserved.
+#
+# This code is based on EleutherAI's GPT-NeoX library and the GPT-NeoX
+# and OPT implementations in this library. It has been modified from its
+# original forms to accommodate minor architectural differences compared
+# to GPT-NeoX and OPT used by the Meta AI team that trained the model.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""PyTorch Zamba model."""
+
+import math
+from typing import Any, Callable, Optional, Union
+
+import torch
+from torch import nn
+from torch.nn import BCEWithLogitsLoss, CrossEntropyLoss, MSELoss
+
+from ...activations import ACT2FN
+from ...cache_utils import Cache
+from ...generation import GenerationMixin
+from ...modeling_attn_mask_utils import AttentionMaskConverter
+from ...modeling_flash_attention_utils import FlashAttentionKwargs
+from ...modeling_outputs import BaseModelOutputWithPast, CausalLMOutputWithPast, SequenceClassifierOutputWithPast
+from ...modeling_utils import ALL_ATTENTION_FUNCTIONS, PreTrainedModel
+from ...processing_utils import Unpack
+from ...utils import auto_docstring, logging
+from ...utils.deprecation import deprecate_kwarg
+from ...utils.import_utils import is_causal_conv1d_available, is_mamba_ssm_available
+from .configuration_zamba import ZambaConfig
+
+
+if is_mamba_ssm_available():
+    from mamba_ssm.ops.selective_scan_interface import mamba_inner_fn, selective_scan_fn
+    from mamba_ssm.ops.triton.selective_state_update import selective_state_update
+else:
+    selective_state_update, selective_scan_fn, mamba_inner_fn = None, None, None
+
+if is_causal_conv1d_available():
+    from causal_conv1d import causal_conv1d_fn, causal_conv1d_update
+else:
+    causal_conv1d_update, causal_conv1d_fn = None, None
+
+is_fast_path_available = all(
+    (selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)
+)
+
+
+logger = logging.get_logger(__name__)
+
+
+# Copied from transformers.models.llama.modeling_llama.LlamaRMSNorm with Llama->Zamba
+class ZambaRMSNorm(nn.Module):
+    def __init__(self, hidden_size, eps=1e-6):
+        """
+        ZambaRMSNorm is equivalent to T5LayerNorm
+        """
+        super().__init__()
+        self.weight = nn.Parameter(torch.ones(hidden_size))
+        self.variance_epsilon = eps
+
+    def forward(self, hidden_states):
+        input_dtype = hidden_states.dtype
+        hidden_states = hidden_states.to(torch.float32)
+        variance = hidden_states.pow(2).mean(-1, keepdim=True)
+        hidden_states = hidden_states * torch.rsqrt(variance + self.variance_epsilon)
+        return self.weight * hidden_states.to(input_dtype)
+
+    def extra_repr(self):
+        return f"{tuple(self.weight.shape)}, eps={self.variance_epsilon}"
+
+
+# Copied from transformers.models.llama.modeling_llama.repeat_kv
+def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
+    """
+    This is the equivalent of torch.repeat_interleave(x, dim=1, repeats=n_rep). The hidden states go from (batch,
+    num_key_value_heads, seqlen, head_dim) to (batch, num_attention_heads, seqlen, head_dim)
+    """
+    batch, num_key_value_heads, slen, head_dim = hidden_states.shape
+    if n_rep == 1:
+        return hidden_states
+    hidden_states = hidden_states[:, :, None, :, :].expand(batch, num_key_value_heads, n_rep, slen, head_dim)
+    return hidden_states.reshape(batch, num_key_value_heads * n_rep, slen, head_dim)
+
+
+class ZambaHybridDynamicCache:
+    """
+    A dynamic cache that can handle both the attention cache (which has a seq_len dimension) and the mamba cache
+    (which has a constant shape regardless of seq_len).
+
+    This cache has two sets of lists of tensors: `key_cache` and `value_cache` for attention cache and `conv_states`
+    and `ssm_states` for mamba cache. Each of these lists has `num_layers` tensors. The expected shape for each tensor
+    For attention layers, `key_cache` and `value_cache` have a shape of `(batch_size, num_heads, seq_len, head_dim)`,
+    while `conv_states` and `ssm_states` have a shape of `(batch_size, 0)` (empty tensors).
+    For mamba layers, `key_cache` and `value_cache` have a shape of `(batch_size, 0)` (empty tensors),
+    while `conv_states` represents the convolution state and has a shape of `(batch_size, d_inner, d_conv)`,
+    and `ssm_states` represents the ssm state and has a shape of `(batch_size, d_inner, d_state)`.
+    """
+
+    is_compileable = False
+
+    def __init__(self, config, batch_size, dtype=torch.float16, device=None):
+        self.dtype = dtype
+        self.is_compileable = False
+        self.layers_block_type = config.layers_block_type
+        self.has_previous_state = False  # only used by mamba
+        self.intermediate_size = config.mamba_expand * config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.n_mamba_heads = config.n_mamba_heads
+        self.conv_states = []
+        self.ssm_states = []
+        self.transformer_layers = []
+        self._modules = {}
+        self._parameters = {}
+        self._buffers = {}
+        for i in range(config.num_hidden_layers):
+            self.conv_states += [
+                torch.zeros(batch_size, self.intermediate_size, self.conv_kernel_size, device=device, dtype=dtype)
+            ]
+            cache_shape = (
+                batch_size,
+                self.n_mamba_heads,
+                self.intermediate_size // self.n_mamba_heads,
+                self.ssm_state_size,
+            )
+            self.ssm_states += [torch.zeros(cache_shape, device=device, dtype=dtype)]
+            if self.layers_block_type[i] == "hybrid":
+                self.transformer_layers.append(i)
+
+        self.key_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+        self.value_cache = [torch.tensor([[]] * batch_size, device=device) for _ in range(config.num_hidden_layers)]
+
+    def __len__(self):
+        return len(self.key_cache)
+
+    def __getitem__(self, layer_idx: int) -> tuple[torch.Tensor, torch.Tensor]:
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.update
+    def update(
+        self,
+        key_states: torch.Tensor,
+        value_states: torch.Tensor,
+        layer_idx: int,
+        cache_kwargs: Optional[dict[str, Any]] = None,
+    ) -> tuple[torch.Tensor, torch.Tensor]:
+        # Update the cache
+        if self.key_cache[layer_idx].shape[-1] == 0:
+            self.key_cache[layer_idx] = key_states
+            self.value_cache[layer_idx] = value_states
+        else:
+            self.key_cache[layer_idx] = torch.cat([self.key_cache[layer_idx], key_states], dim=2)
+            self.value_cache[layer_idx] = torch.cat([self.value_cache[layer_idx], value_states], dim=2)
+
+        return self.key_cache[layer_idx], self.value_cache[layer_idx]
+
+    # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.reorder_cache
+    def reorder_cache(self, beam_idx: torch.LongTensor):
+        """Reorders the cache for beam search, given the selected beam indices."""
+        for layer_idx in range(len(self.key_cache)):
+            device = self.key_cache[layer_idx].device
+            self.key_cache[layer_idx] = self.key_cache[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.value_cache[layer_idx].device
+            self.value_cache[layer_idx] = self.value_cache[layer_idx].index_select(0, beam_idx.to(device))
+
+            device = self.conv_states[layer_idx].device
+            self.conv_states[layer_idx] = self.conv_states[layer_idx].index_select(0, beam_idx.to(device))
+            device = self.ssm_states[layer_idx].device
+            self.ssm_states[layer_idx] = self.ssm_states[layer_idx].index_select(0, beam_idx.to(device))
+
+    # Copied from transformers.models.jamba.modeling_jamba.HybridMambaAttentionDynamicCache.get_seq_length
+    def get_seq_length(self, layer_idx: Optional[int] = 0) -> int:
+        """Returns the sequence length of the cached states. A layer index can be optionally passed."""
+        # take any layer that contains cache and not empty tensor
+        layer_idx = self.transformer_layers[0] if layer_idx not in self.transformer_layers else layer_idx
+        if len(self.key_cache) <= layer_idx:
+            return 0
+        return self.key_cache[layer_idx].shape[-2]
+
+
+def eager_attention_forward(
+    module: nn.Module,
+    query: torch.Tensor,
+    key: torch.Tensor,
+    value: torch.Tensor,
+    attention_mask: Optional[torch.Tensor],
+    scaling: float,
+    dropout: float = 0.0,
+    **kwargs,
+):
+    key_states = repeat_kv(key, module.num_key_value_groups)
+    value_states = repeat_kv(value, module.num_key_value_groups)
+
+    attn_weights = torch.matmul(query, key_states.transpose(2, 3)) * scaling
+    if attention_mask is not None:
+        causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
+        attn_weights = attn_weights + causal_mask
+
+    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
+    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=module.training)
+    attn_output = torch.matmul(attn_weights, value_states)
+    attn_output = attn_output.transpose(1, 2).contiguous()
+
+    return attn_output, attn_weights
+
+
+class ZambaAttention(nn.Module):
+    """
+    Multi-headed attention from 'Attention Is All You Need' paper. Modified to use sliding window attention: Longformer
+    and "Generating Long Sequences with Sparse Transformers".
+
+    Adapted from transformers.models.mistral.modeling_mistral.MistralAttention:
+    The input dimension here is attention_hidden_size = 2 * hidden_size, and head_dim = attention_hidden_size // num_heads.
+    The extra factor of 2 comes from the input being the concatenation of original_hidden_states with the output of the previous (mamba) layer
+    (see fig. 2 in https://huggingface.co/papers/2405.16712).
+    Additionally, replaced
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim) with
+    attn_weights = torch.matmul(query_states, key_states.transpose(2, 3)) / math.sqrt(self.head_dim/2)
+    """
+
+    def __init__(self, config: ZambaConfig, layer_idx: int):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+
+        self.attention_hidden_size = config.attention_hidden_size
+        self.head_dim = config.attention_head_dim
+        self.num_key_value_groups = config.num_attention_heads // config.num_key_value_heads
+        self.max_position_embeddings = config.max_position_embeddings
+        self.scaling = (self.head_dim / 2) ** -0.5
+        self.is_causal = True
+        self.attention_dropout = config.attention_dropout
+
+        self.q_proj = nn.Linear(config.attention_hidden_size, config.num_attention_heads * self.head_dim, bias=False)
+        self.k_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.v_proj = nn.Linear(config.attention_hidden_size, config.num_key_value_heads * self.head_dim, bias=False)
+        self.o_proj = nn.Linear(config.num_attention_heads * self.head_dim, config.hidden_size, bias=False)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        layer_idx: int,
+        attention_mask: Optional[torch.Tensor],
+        past_key_values: Optional[ZambaHybridDynamicCache] = None,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[tuple[torch.Tensor]]]:
+        input_shape = hidden_states.shape[:-1]
+        hidden_shape = (*input_shape, -1, self.head_dim)
+
+        query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+        value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
+
+        if past_key_values is not None:
+            key_states, value_states = past_key_values.update(key_states, value_states, layer_idx)
+
+        attention_interface: Callable = eager_attention_forward
+        if self.config._attn_implementation != "eager":
+            attention_interface = ALL_ATTENTION_FUNCTIONS[self.config._attn_implementation]
+
+        attn_output, attn_weights = attention_interface(
+            self,
+            query_states,
+            key_states,
+            value_states,
+            attention_mask,
+            dropout=0.0 if not self.training else self.attention_dropout,
+            scaling=self.scaling,
+            **kwargs,
+        )
+
+        attn_output = attn_output.reshape(*input_shape, -1).contiguous()
+        attn_output = self.o_proj(attn_output)
+        return attn_output, attn_weights
+
+
+class ZambaMambaMixer(nn.Module):
+    """
+    Compute ∆, A, B, C, and D the state space parameters and compute the `contextualized_states`.
+    A, D are input independent (see Mamba paper [1] Section 3.5.2 "Interpretation of A" for why A isn't selective)
+    ∆, B, C are input-dependent (this is a key difference between Mamba and the linear time invariant S4,
+    and is why Mamba is called **selective** state spaces)
+
+    This module differs from `transformers.models.mamba.modeling_mamba.MambaMixer` in two ways:
+    - Added multi-head: the output of `self.in_proj` is split into `self.n_mamba_heads` heads, and each head
+    undergoes an independent forward pass, identical to the original `MambaMixer`, up until the pre-activations of
+    `self.out_proj`. The pre-activations, coming from different mamba heads, are then concatenated and fed into `self.out_proj`.
+    """
+
+    def __init__(self, config: ZambaConfig, layer_idx):
+        super().__init__()
+        self.config = config
+        self.layer_idx = layer_idx
+        self.hidden_size = config.hidden_size
+        self.ssm_state_size = config.mamba_d_state
+        self.conv_kernel_size = config.mamba_d_conv
+        self.intermediate_size = config.mamba_expand * config.hidden_size
+        self.time_step_rank = config.mamba_dt_rank
+        self.n_mamba_heads = config.n_mamba_heads
+        self.mamba_head_dim = self.intermediate_size // self.n_mamba_heads
+        self.use_conv_bias = config.mamba_conv_bias
+        self.use_bias = config.mamba_proj_bias
+        self.conv1d = nn.Conv1d(
+            in_channels=self.intermediate_size,
+            out_channels=self.intermediate_size,
+            bias=self.use_conv_bias,
+            kernel_size=self.conv_kernel_size,
+            groups=self.intermediate_size,
+            padding=self.conv_kernel_size - 1,
+        )
+
+        self.activation = config.hidden_mamba_act
+        self.act = ACT2FN[config.hidden_mamba_act]
+
+        self.use_fast_kernels = config.use_mamba_kernels
+
+        # projection of the input hidden states
+        self.in_proj = nn.Linear(self.hidden_size, self.intermediate_size * 2, bias=self.use_bias)
+        # weight associated to the selective projection used to make dt, B and C input dependent
+        # each mamba head is processed independently
+        self.x_proj_weight = nn.Parameter(
+            torch.zeros(
+                self.n_mamba_heads,
+                self.time_step_rank + self.ssm_state_size * 2,
+                self.mamba_head_dim,
+            )
+        )
+        # time step projection (discretization)
+        self.dt_proj_weight = nn.Parameter(
+            (torch.zeros(self.n_mamba_heads, self.mamba_head_dim, self.time_step_rank) - 0.5)
+            * 2
+            / self.time_step_rank**0.5
+        )
+        self.dt_proj_bias = nn.Parameter(torch.zeros(self.n_mamba_heads, self.mamba_head_dim))
+
+        # S4D real initialization. These are not discretized!
+        # The core is to load them, compute the discrete states, then write the updated state. Keeps the memory bounded
+        A = torch.arange(1, self.ssm_state_size + 1, dtype=torch.float32)[None, :]
+        A = A.expand(self.intermediate_size, -1).contiguous()
+        self.A_log = nn.Parameter(torch.log(A).reshape(self.n_mamba_heads, self.mamba_head_dim, -1))
+        self.D = nn.Parameter(torch.ones(self.n_mamba_heads, self.mamba_head_dim))
+        self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=self.use_bias)
+
+        if not is_fast_path_available:
+            logger.warning_once(
+                "The fast path is not available because one of `(selective_state_update, selective_scan_fn, causal_conv1d_fn, causal_conv1d_update, mamba_inner_fn)`"
+                " is None. To install follow https://github.com/state-spaces/mamba/#installation and"
+                " https://github.com/Dao-AILab/causal-conv1d. If you want to use the naive implementation, set `use_mamba_kernels=False` in the model config"
+            )
+
+    def cuda_kernels_forward(
+        self, hidden_states: torch.Tensor, cache_params: ZambaHybridDynamicCache = None, attention_mask=None
+    ):
+        batch_size, seq_len, _ = hidden_states.shape
+        use_precomputed_states = cache_params is not None and cache_params.has_previous_state and seq_len == 1
+
+        # 1. Gated linear projection
+        projected_states = self.in_proj(hidden_states).transpose(1, 2)
+
+        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
+        hidden_states = hidden_states.squeeze(2).contiguous()
+        gate = gate.squeeze(2)
+        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)
+
+        # 2. Convolution sequence transformation
+        conv_weights = self.conv1d.weight.view(self.conv1d.weight.size(0), self.conv1d.weight.size(2))
+        if use_precomputed_states:
+            hidden_states = causal_conv1d_update(
+                hidden_states.squeeze(-1),
+                cache_params.conv_states[self.layer_idx],
+                conv_weights,
+                self.conv1d.bias,
+                self.activation,
+            )
+            hidden_states = hidden_states.unsqueeze(-1)
+        else:
+            if attention_mask is not None and not torch.all(attention_mask == 1):
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)
+            if cache_params is not None:
+                conv_states = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
+                cache_params.conv_states[self.layer_idx].copy_(conv_states)
+            hidden_states = causal_conv1d_fn(hidden_states, conv_weights, self.conv1d.bias, activation=self.activation)
+            if attention_mask is not None and not torch.all(attention_mask == 1):
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 3. SSM sequence transformation
+        # 3.a. input varying initialization of time_step, B and C
+
+        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
+        ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2)
+
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+
+        discrete_time_step = self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)
+
+        A = -torch.exp(self.A_log.float())
+
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        time_proj_bias = self.dt_proj_bias.float() if self.dt_proj_bias is not None else None
+        scan_outputs = torch.empty((batch_size, 0, seq_len), device=hidden_states.device, dtype=hidden_states.dtype)
+
+        if use_precomputed_states:
+            for n in range(self.n_mamba_heads):
+                scan_outputs_ = selective_state_update(
+                    cache_params.ssm_states[self.layer_idx][:, n],
+                    hidden_states[n, ..., 0],
+                    discrete_time_step[n, ..., 0],
+                    A[n],
+                    B[n, :, 0],
+                    C[n, :, 0],
+                    self.D[n],
+                    gate[n, ..., 0],
+                    time_proj_bias[n],
+                    dt_softplus=True,
+                ).unsqueeze(-1)
+                scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1)
+
+        else:
+            ssm_state = torch.empty(
+                (batch_size, 0, self.mamba_head_dim, self.ssm_state_size),
+                device=hidden_states.device,
+                dtype=hidden_states.dtype,
+            )
+            for n in range(self.n_mamba_heads):
+                scan_outputs_, ssm_state_ = selective_scan_fn(
+                    hidden_states[n],
+                    discrete_time_step[n],
+                    A[n],
+                    B[n].transpose(1, 2),
+                    C[n].transpose(1, 2),
+                    self.D[n].float(),
+                    gate[n],
+                    time_proj_bias[n],
+                    delta_softplus=True,
+                    return_last_state=True,
+                )
+                scan_outputs = torch.cat((scan_outputs, scan_outputs_), dim=1).contiguous()
+                ssm_state = torch.cat((ssm_state, ssm_state_.unsqueeze(1)), dim=1)
+            if ssm_state is not None and cache_params is not None:
+                cache_params.ssm_states[self.layer_idx].copy_(ssm_state)
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(scan_outputs.transpose(1, 2))
+        return contextualized_states
+
+    def slow_forward(self, input_states, cache_params: ZambaHybridDynamicCache = None, attention_mask=None):
+        batch_size, seq_len, _ = input_states.shape
+        dtype = input_states.dtype
+        # 1. Gated linear projection
+        projected_states = self.in_proj(input_states).transpose(1, 2)
+
+        hidden_states, gate = projected_states.view(batch_size, -1, 2, seq_len).chunk(2, dim=2)
+        hidden_states = hidden_states.squeeze(2).contiguous()
+        gate = gate.squeeze(2)
+        gate = gate.reshape(batch_size, self.n_mamba_heads, -1, seq_len).transpose(0, 1)
+
+        use_cache = isinstance(cache_params, ZambaHybridDynamicCache)
+        # 2. Convolution sequence transformation
+        if use_cache and cache_params.ssm_states[self.layer_idx].shape[0] == batch_size:
+            if self.training:
+                # In training mode, we don't want to perform in-place operations on ssm_state so we can compute the backwards pass
+                ssm_state = cache_params.ssm_states[self.layer_idx].clone()
+            else:
+                ssm_state = cache_params.ssm_states[self.layer_idx]
+
+            ssm_state = ssm_state.to(hidden_states.device)
+
+            if (
+                cache_params.has_previous_state
+                and seq_len == 1
+                and cache_params.conv_states[self.layer_idx].shape[0] == batch_size
+            ):
+                conv_state = cache_params.conv_states[self.layer_idx]
+                conv_state = torch.roll(conv_state, shifts=-1, dims=-1)
+                conv_state[:, :, -1] = hidden_states[:, :, 0]
+                cache_params.conv_states[self.layer_idx] = conv_state
+                hidden_states = torch.sum(conv_state * self.conv1d.weight[:, 0, :], dim=-1)
+                if self.use_conv_bias:
+                    hidden_states += self.conv1d.bias
+                hidden_states = self.act(hidden_states).to(dtype).unsqueeze(-1)
+            else:
+                if attention_mask is not None and not torch.all(attention_mask == 1):
+                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1)
+                conv_state = nn.functional.pad(hidden_states, (self.conv_kernel_size - hidden_states.shape[-1], 0))
+                cache_params.conv_states[self.layer_idx] = conv_state
+                hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
+                if attention_mask is not None and not torch.all(attention_mask == 1):
+                    hidden_states = hidden_states * attention_mask[:, -hidden_states.shape[-1] :].unsqueeze(1)
+        else:
+            ssm_state = torch.zeros(
+                (batch_size, self.n_mamba_heads, self.mamba_head_dim, self.ssm_state_size),
+                device=hidden_states.device,
+                dtype=dtype,
+            )
+            if attention_mask is not None and not torch.all(attention_mask == 1):
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)
+            hidden_states = self.act(self.conv1d(hidden_states)[..., :seq_len])
+            if attention_mask is not None and not torch.all(attention_mask == 1):
+                hidden_states = hidden_states * attention_mask.unsqueeze(1)
+
+        # 3. State Space Model sequence transformation
+        # 3.a. Selection:  [batch, seq_len, self.time_step_rank + self.ssm_state_size * 2]
+        hidden_states = hidden_states.reshape(-1, self.n_mamba_heads, self.mamba_head_dim, seq_len).transpose(0, 1)
+        ssm_parameters = (self.x_proj_weight[:, None, :, :] @ hidden_states).transpose(-1, -2)
+
+        time_step, B, C = torch.split(
+            ssm_parameters, [self.time_step_rank, self.ssm_state_size, self.ssm_state_size], dim=-1
+        )
+        discrete_time_step = (self.dt_proj_weight[:, None] @ time_step.transpose(-1, -2)) + self.dt_proj_bias[
+            :, None, :, None
+        ]
+
+        discrete_time_step = nn.functional.softplus(discrete_time_step)
+
+        # 3.b. Discretization: B and C to [batch, seq_len, intermediate_size, ssm_state_size] (SRAM)
+        A = -torch.exp(self.A_log.float())
+        discrete_A = torch.exp(A[:, None, :, None, :] * discrete_time_step[:, :, :, :, None])
+        discrete_B = discrete_time_step[:, :, :, :, None] * B[:, :, None, :, :].float()
+        deltaB_u = discrete_B * hidden_states[:, :, :, :, None].float()
+        # 3.c perform the recurrence y ← SSM(A, B, C)(x)
+        scan_outputs = []
+        for i in range(seq_len):
+            ssm_state = discrete_A[:, :, :, i, :].transpose(0, 1) * ssm_state + deltaB_u[:, :, :, i, :].transpose(0, 1)
+            scan_output = torch.matmul(ssm_state.transpose(0, 1).to(dtype), C[:, :, i, :].unsqueeze(-1))
+            scan_outputs.append(scan_output[:, :, :, 0])
+        scan_output = torch.stack(scan_outputs, dim=-1)
+        scan_output = scan_output + (hidden_states * self.D[:, None, :, None])
+        scan_output = scan_output * self.act(gate)
+
+        if use_cache:
+            cache_params.ssm_states[self.layer_idx] = ssm_state
+
+        # 4. Final linear projection
+        contextualized_states = self.out_proj(
+            scan_output.transpose(0, 1).reshape(batch_size, -1, seq_len).transpose(1, 2)
+        )
+        return contextualized_states
+
+    def forward(self, hidden_states, cache_params: ZambaHybridDynamicCache = None, attention_mask=None):
+        if self.use_fast_kernels:
+            if not is_fast_path_available or "cuda" not in self.x_proj_weight.device.type:
+                raise ValueError(
+                    "Fast Mamba kernels are not available. Make sure to they are installed and that "
+                    "the mamba module is on a CUDA device. lease run 'pip install causal-conv1d>=1.2.0' "
+                    "and 'pip install mamba-ssm', or set use_mamba_kernels=False in the model's config."
+                )
+            return self.cuda_kernels_forward(hidden_states, cache_params, attention_mask=attention_mask)
+        return self.slow_forward(hidden_states, cache_params, attention_mask=attention_mask)
+
+
+# Copied from transformers.models.mistral.modeling_mistral.MistralMLP with Mistral->Zamba
+class ZambaMLP(nn.Module):
+    def __init__(self, config):
+        super().__init__()
+        self.config = config
+        self.hidden_size = config.hidden_size
+        self.intermediate_size = config.intermediate_size
+        self.gate_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.up_proj = nn.Linear(self.hidden_size, self.intermediate_size, bias=False)
+        self.down_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=False)
+        self.act_fn = ACT2FN[config.hidden_act]
+
+    def forward(self, x):
+        down_proj = self.down_proj(self.act_fn(self.gate_proj(x)) * self.up_proj(x))
+        return down_proj
+
+
+class ZambaAttentionDecoderLayer(nn.Module):
+    def __init__(self, config: ZambaConfig, layer_idx: Optional[int] = None):
+        super().__init__()
+        self.self_attn = ZambaAttention(config, layer_idx)
+
+        self.feed_forward = ZambaMLP(config)
+        self.input_layernorm = ZambaRMSNorm(config.attention_hidden_size, eps=config.rms_norm_eps)
+        self.pre_ff_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: torch.Tensor,
+        layer_idx: int,
+        attention_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[ZambaHybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        **kwargs: Unpack[FlashAttentionKwargs],
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): output of previous Mamba layer of shape `(batch, seq_len, embed_dim)`
+            original_hidden_states (`torch.FloatTensor`): word embedding output of shape `(batch, seq_len, embed_dim)`.
+                This is concatenated with `hidden_states` (which is the output of the previous (mamba) layer). The
+                concatenated tensor is then used as input of the pre-attention RMSNorm
+                (see fig. 2 in https://huggingface.co/papers/2405.16712).
+            layer_idx (`int`): layer_idx in the forward pass. Used to distinguish Zamba's tied transformer layers.
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+        """
+        hidden_states = torch.concatenate([hidden_states, original_hidden_states], dim=-1)
+        hidden_states = self.input_layernorm(hidden_states)
+        hidden_states, self_attn_weights = self.self_attn(
+            hidden_states=hidden_states,
+            layer_idx=layer_idx,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            **kwargs,
+        )
+        # feed-forward (MLP)
+        hidden_states = self.pre_ff_layernorm(hidden_states)
+        hidden_states = self.feed_forward(hidden_states)
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        return outputs
+
+
+class ZambaMambaDecoderLayer(nn.Module):
+    def __init__(self, config: ZambaConfig, layer_idx: int):
+        super().__init__()
+        self.mamba = ZambaMambaMixer(config=config, layer_idx=layer_idx)
+        self.input_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+        self.layer_idx = layer_idx
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: Optional[torch.Tensor] = None,
+        layer_idx: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[ZambaHybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+        transformer_hidden_states: Optional[torch.Tensor] = None,
+        **kwargs,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+        """
+
+        residual = hidden_states
+
+        # `transformer_hidden_states` is the output from shared transformer + linear layer (see fig. 2 in https://huggingface.co/papers/2405.16712).
+        # `transformer_hidden_states` is then added to the input to the mamba layer below (as described in eq. (6) of https://huggingface.co/papers/2405.16712).
+        hidden_states = (
+            hidden_states + transformer_hidden_states if transformer_hidden_states is not None else hidden_states
+        )
+        hidden_states = self.input_layernorm(hidden_states)
+
+        hidden_states = self.mamba(
+            hidden_states=hidden_states,
+            cache_params=past_key_values,
+            attention_mask=attention_mask,
+        )
+
+        self_attn_weights = None
+
+        # residual connection after mamba
+        hidden_states = residual + hidden_states
+
+        outputs = (hidden_states,)
+
+        if output_attentions:
+            outputs += (self_attn_weights,)
+
+        if use_cache:
+            outputs += (past_key_values,)
+
+        return outputs
+
+
+class ZambaHybridLayer(nn.Module):
+    def __init__(self, shared_transf: ZambaAttentionDecoderLayer, linear: nn.Linear, mamba: ZambaMambaDecoderLayer):
+        super().__init__()
+        self.shared_transf = shared_transf
+        self.linear = linear
+        self.mamba_decoder = mamba
+
+    @deprecate_kwarg("past_key_value", new_name="past_key_values", version="4.58")
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        original_hidden_states: Optional[torch.Tensor] = None,
+        layer_idx: Optional[int] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        causal_mask: Optional[torch.Tensor] = None,
+        past_key_values: Optional[ZambaHybridDynamicCache] = None,
+        output_attentions: Optional[bool] = False,
+        use_cache: Optional[bool] = False,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> tuple[torch.FloatTensor, Optional[tuple[torch.FloatTensor, torch.FloatTensor]]]:
+        """
+        Args:
+            hidden_states (`torch.FloatTensor`): input to the layer of shape `(batch, seq_len, embed_dim)`
+            original_hidden_states (`torch.FloatTensor`): word embedding output that will be concatenated with
+            hidden activations to form the input of the shared transformer layer.
+            layer_idx (`int`): layer number.
+            attention_mask (`torch.FloatTensor`, *optional*): attention mask of size
+                `(batch, sequence_length)` where padding elements are indicated by 0.
+            past_key_values (`ZambaHybridDynamicCache`, *optional*): cached past key and value projection states
+            output_attentions (`bool`, *optional*):
+                Whether or not to return the attentions tensors of all attention layers. See `attentions` under
+                returned tensors for more detail.
+            use_cache (`bool`, *optional*):
+                If set to `True`, `past_key_values` key value states are returned and can be used to speed up decoding
+                (see `past_key_values`).
+            cache_position (`torch.LongTensor` of shape `(sequence_length)`, *optional*):
+                Indices depicting the position of the input sequence tokens in the sequence.
+        """
+
+        layer_outputs = self.shared_transf(
+            hidden_states,
+            original_hidden_states=original_hidden_states,
+            layer_idx=layer_idx,
+            attention_mask=causal_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+        )
+
+        transformer_hidden_states = layer_outputs[0]
+
+        if output_attentions:
+            self_attn_weights = layer_outputs[1]
+
+        transformer_hidden_states = self.linear(transformer_hidden_states)
+
+        layer_outputs = self.mamba_decoder(
+            hidden_states,
+            transformer_hidden_states=transformer_hidden_states,
+            attention_mask=attention_mask,
+            past_key_values=past_key_values,
+            output_attentions=output_attentions,
+            use_cache=use_cache,
+            cache_position=cache_position,
+        )
+
+        if output_attentions:
+            layer_outputs = (layer_outputs[0], self_attn_weights) + layer_outputs[2:]
+
+        return layer_outputs
+
+
+@auto_docstring
+class ZambaPreTrainedModel(PreTrainedModel):
+    config: ZambaConfig
+    base_model_prefix = "model"
+    supports_gradient_checkpointing = True
+    _no_split_modules = ["ZambaAttentionDecoderLayer", "ZambaMambaDecoderLayer"]
+    _skip_keys_device_placement = "past_key_values"
+    _supports_flash_attn = False
+    _supports_sdpa = False
+    # Note: only supports ZambaHybridDynamicCache
+    _is_stateful = True
+
+    def _init_weights(self, module):
+        std = self.config.initializer_range
+        if isinstance(module, (nn.Linear, nn.Conv1d)):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.bias is not None:
+                module.bias.data.zero_()
+        elif isinstance(module, nn.Embedding):
+            module.weight.data.normal_(mean=0.0, std=std)
+            if module.padding_idx is not None:
+                module.weight.data[module.padding_idx].zero_()
+        elif isinstance(module, ZambaRMSNorm):
+            module.weight.data.fill_(1.0)
+        elif isinstance(module, ZambaMambaMixer):
+            module.x_proj_weight.data.normal_(mean=0.0, std=std)
+            dt_init_std = self.config.mamba_dt_rank**-0.5
+            nn.init.uniform_(module.dt_proj_weight, -dt_init_std, dt_init_std)
+
+            mamba_head_dim = self.config.mamba_expand * self.config.hidden_size // self.config.n_mamba_heads
+            dt = torch.exp(
+                torch.rand(self.config.n_mamba_heads, mamba_head_dim)
+                * (math.log(self.config.time_step_max) - math.log(self.config.time_step_min))
+                + math.log(self.config.time_step_min)
+            ).clamp(min=self.config.time_step_floor)
+            # # Inverse of softplus: https://github.com/pytorch/pytorch/issues/72759
+            inv_dt = dt + torch.log(-torch.expm1(-dt))
+            module.dt_proj_bias.data.copy_(inv_dt)
+
+            A = torch.arange(1, module.ssm_state_size + 1, dtype=torch.float32)[None, :]
+            A = A.expand(module.intermediate_size, -1).contiguous()
+            module.A_log.data.copy_(torch.log(A).reshape(module.n_mamba_heads, module.mamba_head_dim, -1))
+            module.D.data.fill_(1.0)
+
+
+@auto_docstring
+class ZambaModel(ZambaPreTrainedModel):
+    """
+    Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`ZambaDecoderLayer`]
+
+    Args:
+        config: ZambaConfig
+    """
+
+    def __init__(self, config: ZambaConfig):
+        super().__init__(config)
+        self.padding_idx = config.pad_token_id
+        self.vocab_size = config.vocab_size
+
+        self.embed_tokens = nn.Embedding(config.vocab_size, config.hidden_size, self.padding_idx)
+        block = ZambaAttentionDecoderLayer(config)
+        mamba_layers = []
+        linear_layers = []
+        self.layers_block_type = config.layers_block_type
+        for i in range(config.num_hidden_layers):
+            if config.layers_block_type[i] == "mamba":
+                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
+            elif config.layers_block_type[i] == "hybrid":
+                linear_layers.append(nn.Linear(self.config.hidden_size, self.config.hidden_size, bias=False))
+                mamba_layers.append(ZambaMambaDecoderLayer(config, layer_idx=i))
+        mamba_layers = iter(mamba_layers)
+        linear_layers = iter(linear_layers)
+        layers = []
+        self._tied_weights_keys = []
+        for layer_id, layer_type in enumerate(self.layers_block_type):
+            if layer_type == "hybrid":
+                prefix_name = f"layers.{layer_id}."
+                tied_keys = [
+                    "shared_transf.self_attn.q_proj.weight",
+                    "shared_transf.self_attn.k_proj.weight",
+                    "shared_transf.self_attn.v_proj.weight",
+                    "shared_transf.self_attn.o_proj.weight",
+                    "shared_transf.feed_forward.gate_proj.weight",
+                    "shared_transf.feed_forward.up_proj.weight",
+                    "shared_transf.feed_forward.down_proj.weight",
+                    "shared_transf.input_layernorm.weight",
+                    "shared_transf.pre_ff_layernorm.weight",
+                ]
+                self._tied_weights_keys = [*self._tied_weights_keys, *[prefix_name + key for key in tied_keys]]
+                layers.append(ZambaHybridLayer(block, next(linear_layers), next(mamba_layers)))
+            else:
+                layers.append(next(mamba_layers))
+        self.layers = nn.ModuleList(layers)
+
+        self._attn_implementation = config._attn_implementation
+        self.final_layernorm = ZambaRMSNorm(config.hidden_size, eps=config.rms_norm_eps)
+
+        self.gradient_checkpointing = False
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[ZambaHybridDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+    ) -> Union[tuple, BaseModelOutputWithPast]:
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        use_cache = use_cache if use_cache is not None else self.config.use_cache
+
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        if (input_ids is None) ^ (inputs_embeds is not None):
+            raise ValueError(
+                "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one"
+            )
+
+        if self.gradient_checkpointing and self.training and use_cache:
+            logger.warning_once(
+                "`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`."
+            )
+            use_cache = False
+
+        if inputs_embeds is None:
+            inputs_embeds = self.embed_tokens(input_ids)
+
+        hidden_states = inputs_embeds
+
+        original_hidden_states = torch.clone(inputs_embeds)
+        # original_hidden_states: word embedding output that will be concatenated with hidden activations to form the input of the shared transformer layer
+
+        if use_cache and past_key_values is None:
+            logger.warning_once(
+                "Zamba requires an initialized `ZambaHybridDynamicCache` to return a cache. None was "
+                "provided, so no cache will be returned."
+            )
+
+        if cache_position is None:
+            cache_position = torch.arange(hidden_states.shape[1], device=hidden_states.device)
+        if position_ids is None:
+            position_ids = cache_position.unsqueeze(0)
+
+        causal_mask = self._update_causal_mask(attention_mask, inputs_embeds, cache_position)
+
+        all_hidden_states = () if output_hidden_states else None
+        all_self_attns = () if output_attentions else None
+
+        for layer_idx, layer in enumerate(self.layers):
+            if output_hidden_states:
+                all_hidden_states += (hidden_states,)
+
+            if self.gradient_checkpointing and self.training:
+                layer_outputs = self._gradient_checkpointing_func(
+                    layer.__call__,
+                    hidden_states,
+                    original_hidden_states,
+                    layer_idx,
+                    attention_mask,
+                    causal_mask,
+                    past_key_values,
+                    output_attentions,
+                    use_cache,
+                    cache_position,
+                )
+            else:
+                layer_outputs = layer(
+                    hidden_states,
+                    original_hidden_states=original_hidden_states,
+                    layer_idx=layer_idx,
+                    attention_mask=attention_mask,
+                    causal_mask=causal_mask,
+                    past_key_values=past_key_values,
+                    output_attentions=output_attentions,
+                    use_cache=use_cache,
+                    cache_position=cache_position,
+                )
+            hidden_states = layer_outputs[0]
+
+            if output_attentions:
+                if layer_outputs[1] is not None:
+                    # append attentions only of attention layers. Mamba layers return `None` as the attention weights
+                    all_self_attns += (layer_outputs[1],)
+
+        hidden_states = self.final_layernorm(hidden_states)
+
+        # add hidden states from the last decoder layer
+        if output_hidden_states:
+            all_hidden_states += (hidden_states,)
+
+        if past_key_values and not past_key_values.has_previous_state:
+            past_key_values.has_previous_state = True
+
+        output = BaseModelOutputWithPast(
+            last_hidden_state=hidden_states,
+            past_key_values=past_key_values if use_cache else None,
+            hidden_states=all_hidden_states,
+            attentions=all_self_attns,
+        )
+        return output if return_dict else output.to_tuple()
+
+    # Copied from transformers.models.jamba.modeling_jamba.JambaModel._update_causal_mask
+    def _update_causal_mask(self, attention_mask, input_tensor, cache_position):
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+        target_length = cache_position[-1] + 1
+
+        causal_mask = torch.full((sequence_length, target_length), fill_value=min_dtype, dtype=dtype, device=device)
+        if sequence_length != 1:
+            causal_mask = torch.triu(causal_mask, diagonal=1)
+        causal_mask *= torch.arange(target_length, device=device) > cache_position.reshape(-1, 1)
+        causal_mask = causal_mask[None, None, :, :].expand(input_tensor.shape[0], 1, -1, -1)
+        if attention_mask is not None:
+            causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+            if attention_mask.dim() == 2:
+                mask_length = attention_mask.shape[-1]
+                padding_mask = causal_mask[..., :mask_length].eq(0.0) * attention_mask[:, None, None, :].eq(0.0)
+                causal_mask[..., :mask_length] = causal_mask[..., :mask_length].masked_fill(padding_mask, min_dtype)
+
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type in ["cuda", "xpu", "npu"]
+        ):
+            # Attend to all tokens in fully masked rows in the causal_mask, for example the relevant first rows when
+            # using left padding. This is required by F.scaled_dot_product_attention memory-efficient attention path.
+            # Details: https://github.com/pytorch/pytorch/issues/110213
+            causal_mask = AttentionMaskConverter._unmask_unattended(causal_mask, min_dtype)
+
+        return causal_mask
+
+
+# Adapted from transformers.models.jamba.modeling_jamba.JambaForCausalLM with Jamba->Zamba, JAMBA->ZAMBA
+class ZambaForCausalLM(ZambaPreTrainedModel, GenerationMixin):
+    def __init__(self, config: ZambaConfig):
+        super().__init__(config)
+        self.model = ZambaModel(config)
+        self._tied_weights_keys = ["lm_head.weight", *self.model._tied_weights_keys]
+        self.vocab_size = config.vocab_size
+        self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[ZambaHybridDynamicCache] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        cache_position: Optional[torch.LongTensor] = None,
+        logits_to_keep: Union[int, torch.Tensor] = 0,
+        **kwargs,
+    ) -> Union[tuple, CausalLMOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
+            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
+            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
+            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
+
+        Example:
+
+        ```python
+        >>> from transformers import AutoTokenizer, ZambaForCausalLM
+
+        >>> model = ZambaForCausalLM.from_pretrained("Zyphra/Zamba-7B-v1")
+        >>> tokenizer = AutoTokenizer.from_pretrained("Zyphra/Zamba-7B-v1")
+
+        >>> prompt = "Hey, are you conscious? Can you talk to me?"
+        >>> inputs = tokenizer(prompt, return_tensors="pt")
+
+        >>> # Generate
+        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
+        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
+        "Hey, are you conscious? Can you talk to me?\nI'm not conscious, but I can talk to you."
+        ```"""
+
+        output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
+
+        output_hidden_states = (
+            output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
+        )
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        # decoder outputs consists of (dec_features, layer_state, dec_hidden, dec_attn)
+        outputs = self.model(
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            cache_position=cache_position,
+            return_dict=return_dict,
+        )
+
+        hidden_states = outputs[0]
+        # Only compute necessary logits, and do not upcast them to float if we are not computing the loss
+        slice_indices = slice(-logits_to_keep, None) if isinstance(logits_to_keep, int) else logits_to_keep
+        logits = self.lm_head(hidden_states[:, slice_indices, :])
+
+        loss = None
+        if labels is not None:
+            loss = self.loss_function(logits, labels, self.vocab_size, **kwargs)
+
+        if not return_dict:
+            output = (logits,) + outputs[1:]
+            return (loss,) + output if loss is not None else output
+
+        return CausalLMOutputWithPast(
+            loss=loss,
+            logits=logits,
+            past_key_values=outputs.past_key_values,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        **kwargs,
+    ):
+        # Overwritten -- has a unique cache type, `ZambaHybridDynamicCache`
+
+        empty_past_kv = past_key_values is None
+
+        # Omit tokens covered by past_key_values
+        if not empty_past_kv:
+            # If we have cache: let's slice `input_ids` through `cache_position`, to keep only the unprocessed tokens
+            # Exception 1: when passing input_embeds, input_ids may be missing entries
+            # Exception 2: some generation methods do special slicing of input_ids, so we don't need to do it here
+            # Exception 3: with synced GPUs cache_position may go out of bounds, but we only want dummy token in that case.
+            #              (we can't check exception 3 while compiling)
+            if (
+                inputs_embeds is not None  # Exception 1
+                or cache_position[-1] >= input_ids.shape[1]  # Exception 3
+            ):
+                input_ids = input_ids[:, -cache_position.shape[0] :]
+            elif input_ids.shape[1] != cache_position.shape[0]:  # Default case (the "else", a no op, is Exception 2)
+                input_ids = input_ids[:, cache_position]
+        else:
+            past_key_values = ZambaHybridDynamicCache(
+                self.config, input_ids.shape[0], dtype=self.dtype, device=self.device
+            )
+
+        if attention_mask is not None and position_ids is None:
+            # create position_ids on the fly for batch generation
+            position_ids = attention_mask.long().cumsum(-1) - 1
+            position_ids.masked_fill_(attention_mask == 0, 1)
+            if not empty_past_kv:
+                position_ids = position_ids[:, -input_ids.shape[1] :]
+
+        # if `inputs_embeds` are passed, we only want to use them in the 1st generation step
+        if inputs_embeds is not None and empty_past_kv:
+            model_inputs = {"inputs_embeds": inputs_embeds}
+        else:
+            model_inputs = {"input_ids": input_ids.contiguous()}  # `contiguous()` needed for compilation use cases
+
+        model_inputs.update(
+            {
+                "position_ids": position_ids,
+                "past_key_values": past_key_values,
+                "use_cache": use_cache,
+                "attention_mask": attention_mask,
+                "logits_to_keep": self.config.num_logits_to_keep,
+                "cache_position": cache_position,
+            }
+        )
+
+        # Forward ALL kwargs that are uninitialized (e.g. `use_cache`).
+        for key, value in kwargs.items():
+            if key not in model_inputs:
+                model_inputs[key] = value
+
+        return model_inputs
+
+
+@auto_docstring(
+    custom_intro="""
+    The Zamba Model with a sequence classification head on top (linear layer).
+
+    [`ZambaForSequenceClassification`] uses the last token in order to do the classification, as other causal models
+    (e.g. GPT-2) do.
+
+    Since it does classification on the last token, it requires to know the position of the last token. If a
+    `pad_token_id` is defined in the configuration, it finds the last token that is not a padding token in each row. If
+    no `pad_token_id` is defined, it simply takes the last value in each row of the batch. Since it cannot guess the
+    padding tokens when `inputs_embeds` are passed instead of `input_ids`, it does the same (take the last value in
+    each row of the batch).
+    """
+)
+class ZambaForSequenceClassification(ZambaPreTrainedModel):
+    def __init__(self, config):
+        super().__init__(config)
+        self.num_labels = config.num_labels
+        self.model = ZambaModel(config)
+        self._tied_weights_keys = self.model._tied_weights_keys
+        self.score = nn.Linear(config.hidden_size, self.num_labels, bias=False)
+
+        # Initialize weights and apply final processing
+        self.post_init()
+
+    @auto_docstring
+    def forward(
+        self,
+        input_ids: Optional[torch.LongTensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        position_ids: Optional[torch.LongTensor] = None,
+        past_key_values: Optional[Union[Cache, list[torch.FloatTensor]]] = None,
+        inputs_embeds: Optional[torch.FloatTensor] = None,
+        labels: Optional[torch.LongTensor] = None,
+        use_cache: Optional[bool] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+    ) -> Union[tuple, SequenceClassifierOutputWithPast]:
+        r"""
+        labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
+            Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
+            config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
+            `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
+        """
+        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
+
+        transformer_outputs = self.model(
+            input_ids,
+            attention_mask=attention_mask,
+            position_ids=position_ids,
+            past_key_values=past_key_values,
+            inputs_embeds=inputs_embeds,
+            use_cache=use_cache,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+        )
+        hidden_states = transformer_outputs[0]
+        logits = self.score(hidden_states)
+
+        if input_ids is not None:
+            batch_size = input_ids.shape[0]
+        else:
+            batch_size = inputs_embeds.shape[0]
+
+        if self.config.pad_token_id is None and batch_size != 1:
+            raise ValueError("Cannot handle batch sizes > 1 if no padding token is defined.")
+        if self.config.pad_token_id is None:
+            last_non_pad_token = -1
+        elif input_ids is not None:
+            # To handle both left- and right- padding, we take the rightmost token that is not equal to pad_token_id
+            non_pad_mask = (input_ids != self.config.pad_token_id).to(logits.device, torch.int32)
+            token_indices = torch.arange(input_ids.shape[-1], device=logits.device, dtype=torch.int32)
+            last_non_pad_token = (token_indices * non_pad_mask).argmax(-1)
+        else:
+            last_non_pad_token = -1
+            logger.warning_once(
+                f"{self.__class__.__name__} will not detect padding tokens in `inputs_embeds`. Results may be "
+                "unexpected if using padding tokens in conjunction with `inputs_embeds.`"
+            )
+
+        pooled_logits = logits[torch.arange(batch_size, device=logits.device), last_non_pad_token]
+
+        loss = None
+        if labels is not None:
+            labels = labels.to(logits.device)
+            if self.config.problem_type is None:
+                if self.num_labels == 1:
+                    self.config.problem_type = "regression"
+                elif self.num_labels > 1 and (labels.dtype == torch.long or labels.dtype == torch.int):
+                    self.config.problem_type = "single_label_classification"
+                else:
+                    self.config.problem_type = "multi_label_classification"
+
+            if self.config.problem_type == "regression":
+                loss_fct = MSELoss()
+                if self.num_labels == 1:
+                    loss = loss_fct(pooled_logits.squeeze(), labels.squeeze())
+                else:
+                    loss = loss_fct(pooled_logits, labels)
+            elif self.config.problem_type == "single_label_classification":
+                loss_fct = CrossEntropyLoss()
+                loss = loss_fct(pooled_logits.view(-1, self.num_labels), labels.view(-1))
+            elif self.config.problem_type == "multi_label_classification":
+                loss_fct = BCEWithLogitsLoss()
+                loss = loss_fct(pooled_logits, labels)
+        if not return_dict:
+            output = (pooled_logits,) + transformer_outputs[1:]
+            return ((loss,) + output) if loss is not None else output
+
+        return SequenceClassifierOutputWithPast(
+            loss=loss,
+            logits=pooled_logits,
+            past_key_values=transformer_outputs.past_key_values,
+            hidden_states=transformer_outputs.hidden_states,
+            attentions=transformer_outputs.attentions,
+        )
+
+
+__all__ = ["ZambaForCausalLM", "ZambaForSequenceClassification", "ZambaModel", "ZambaPreTrainedModel"]
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/__init__.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/__init__.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..755f06de5f2b50c5a8574cb70a482b65f217aa56
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/__init__.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/configuration_zamba2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/configuration_zamba2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..54f645ae0ffa0675f417d86a1610183e92aa5171
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/configuration_zamba2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..c0b9b895f0ea0d0dda6751db50520079f0527ba8
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modeling_zamba2.cpython-312.pyc differ
diff --git a/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modular_zamba2.cpython-312.pyc b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modular_zamba2.cpython-312.pyc
new file mode 100644
index 0000000000000000000000000000000000000000..471ac1df0fe011d5119fa462078ba162efb8c42d
Binary files /dev/null and b/Prism/LLaDA/LLaDA_Prism/.venv/lib/python3.12/site-packages/transformers/models/zamba2/__pycache__/modular_zamba2.cpython-312.pyc differ